├── common
│   ├── __init__.py
│   ├── memory_tracker.py
│   └── utils.py
├── models
│   └── .gitkeep
├── logs
│   ├── llama
│   │   └── .gitkeep
│   └── mistral
│       └── .gitkeep
├── bench_lightning
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── image.png
├── bench_candle
│   ├── requirements.txt
│   ├── llama2-candle
│   │   └── Cargo.toml
│   ├── setup.sh
│   ├── README.md
│   ├── convert_to_safetensors.py
│   └── bench.sh
├── bench_deepspeed
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── bench_exllamav2
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── bench_ctransformers
│   ├── requirements.txt
│   ├── bench.py
│   ├── setup.sh
│   └── bench.sh
├── bench_ctranslate
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── setup.cfg
├── bench_llamacpp
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── bench_autoawq
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── bench_pytorch
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.sh
│   └── bench.py
├── bench_autogptq
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.sh
│   └── bench.py
├── .github
│   └── workflows
│       └── precommit.yaml
├── models.json
├── .pre-commit-config.yaml
├── LICENSE
├── bench_optimum_nvidia
│   ├── converter.py
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── bench_onnxruntime
│   ├── setup.sh
│   ├── bench.sh
│   └── bench.py
├── benchmark.sh
├── download.sh
├── questions.json
├── docs
│   ├── archive.md
│   └── ml_engines.md
├── .gitignore
├── bench_tensorrtllm
│   ├── bench.sh
│   └── bench.py
└── bench_vllm
    ├── bench.py
    ├── bench.sh
    └── setup.sh
/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /logs/llama/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /logs/mistral/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bench_lightning/requirements.txt: -------------------------------------------------------------------------------- 1 | scipy==1.13.0 2 | bitsandbytes==0.43.1 3 | -------------------------------------------------------------------------------- /image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/premAI-io/benchmarks/HEAD/image.png -------------------------------------------------------------------------------- /bench_candle/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.0 2 | safetensors==0.4.0 3 | numpy==1.26.2 4 | -------------------------------------------------------------------------------- /bench_deepspeed/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed-mii==0.2.3 2 | mpi4py==3.1.5 3 | sentencepiece==0.2.0 4 | -------------------------------------------------------------------------------- /bench_exllamav2/requirements.txt: -------------------------------------------------------------------------------- 1 | exllamav2==0.0.19 2 | tqdm==4.66.2 3 | tokenizers==0.15.2 4 | transformers==4.40.0 5 | --------------------------------------------------------------------------------
/bench_ctransformers/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.4 2 | huggingface-hub>=0.17.1 3 | transformers==4.38.2 4 | torch==2.2.2 5 | -------------------------------------------------------------------------------- /bench_ctranslate/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece==0.1.99 2 | ctranslate2==4.1.0 3 | transformers==4.35.0 4 | torch==2.1.0 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | exclude = .tox,.git,*/migrations/*,*/static/CACHE/*,docs,node_modules,venv 4 | -------------------------------------------------------------------------------- /bench_llamacpp/requirements.txt: -------------------------------------------------------------------------------- 1 | llama_cpp_python==0.2.62 2 | huggingface_hub==0.22.2 3 | transformers==4.39.3 4 | torch==2.2.2 5 | -------------------------------------------------------------------------------- /bench_autoawq/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.2.2 2 | accelerate==0.28.0 3 | transformers==4.38.2 4 | optimum==1.18.0 5 | autoawq==0.2.4 6 | autoawq-kernels==0.0.6 7 | -------------------------------------------------------------------------------- /bench_pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.39.3 2 | torch==2.2.2 3 | accelerate==0.28.0 4 | sentencepiece==0.2.0 5 | protobuf==0.2.0 6 | bitsandbytes==0.43.1 7 | -------------------------------------------------------------------------------- /bench_autogptq/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.4 2 | gekko==1.1.0 3 | pandas==2.2.1 4 | huggingface_hub==0.22.2 5 | torch==2.2.1 6 | transformers==4.38.2 7 | fsspec[http]>=2023.1.0,<=2024.2.0 8 | -------------------------------------------------------------------------------- /.github/workflows/precommit.yaml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | branches: [main, dev] 6 | push: 7 | branches: [main, dev] 8 | 9 | jobs: 10 | pre-commit: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - uses: actions/setup-python@v3 15 | - uses: pre-commit/action@v3.0.0 16 | -------------------------------------------------------------------------------- /models.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url": "https://prem-models.s3.eu-central-1.amazonaws.com/llama-v2/Llama-2-7b-chat-hf.zip", 4 | "file": "llama-2-7b-chat-hf.zip", 5 | "folder": "./models/llama-2-7b-chat-hf" 6 | }, 7 | { 8 | "url": "https://prem-models.s3.eu-central-1.amazonaws.com/mistral-0.1/Mistral-7B-Instruct-v0.1.zip", 9 | "file": "mistral-7b-v0.1-instruct.zip", 10 | "folder": "./models/mistral-7b-v0.1-instruct-hf" 11 | } 12 | ] 13 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_stages: [commit] 2 | 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.5.0 6 | hooks: 7 | - id: 
trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: end-of-file-fixer 11 | - id: check-toml 12 | - id: check-xml 13 | - id: debug-statements 14 | - id: check-builtin-literals 15 | - id: check-case-conflict 16 | 17 | - repo: https://github.com/psf/black 18 | rev: 23.11.0 19 | hooks: 20 | - id: black 21 | 22 | - repo: https://github.com/PyCQA/isort 23 | rev: 5.12.0 24 | hooks: 25 | - id: isort 26 | 27 | - repo: https://github.com/PyCQA/flake8 28 | rev: 6.1.0 29 | hooks: 30 | - id: flake8 31 | args: ["--config=setup.cfg"] 32 | additional_dependencies: [flake8-isort] 33 | 34 | - repo: https://github.com/shellcheck-py/shellcheck-py 35 | rev: v0.9.0.6 36 | hooks: 37 | - id: shellcheck 38 | 39 | ci: 40 | autoupdate_schedule: weekly 41 | skip: [] 42 | submodules: false 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Prem 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bench_candle/llama2-candle/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "llama2-candle" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | accelerate-src = { version = "0.3.2", optional = true } 10 | anyhow = { version = "1", features = ["backtrace"] } 11 | candle = { version = "0.5.1", package = "candle-core" } 12 | candle-examples = { version = "0.5.1", package = "candle-examples" } 13 | candle-nn = { version = "0.5.1", package = "candle-nn" } 14 | candle-transformers = { version = "0.5.1", package = "candle-transformers" } 15 | clap = { version = "4.2.4", features = ["derive"] } 16 | env_logger = "0.10.0" 17 | hf-hub = "0.3.2" 18 | imageproc = { version = "0.23.0", default-features = false } 19 | log = "0.4" 20 | rand = "0.8.5" 21 | rusttype = { version = "0.9", default-features = false } 22 | serde_json = "1.0.99" 23 | tokenizers = { version = "0.19.1", features = ["onig"] } 24 | tracing-chrome = "0.7.1" 25 | tracing-subscriber = "0.3.7" 26 | 27 | [features] 28 | accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"] 29 | cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"] 30 | -------------------------------------------------------------------------------- /bench_deepspeed/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 12 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 13 | VENV_DIR="$SCRIPT_DIR/venv" 14 | 15 | check_python() { 16 | if command -v python &> /dev/null; then 17 | PYTHON_CMD="python" 18 | elif command -v python3 &> /dev/null; then 19 | PYTHON_CMD="python3" 20 | else 21 | echo "Python is not installed." 22 | exit 1 23 | fi 24 | } 25 | 26 | check_python 27 | 28 | if [ ! -d "$VENV_DIR" ]; then 29 | "$PYTHON_CMD" -m venv "$VENV_DIR" 30 | echo "Virtual environment '$VENV_DIR' created." 31 | # shellcheck disable=SC1091 32 | source "$VENV_DIR/bin/activate" 33 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 34 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 35 | else 36 | # shellcheck disable=SC1091 37 | source "$VENV_DIR/bin/activate" 38 | fi 39 | -------------------------------------------------------------------------------- /bench_pytorch/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 
12 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 13 | VENV_DIR="$SCRIPT_DIR/venv" 14 | 15 | check_python() { 16 | if command -v python &> /dev/null; then 17 | PYTHON_CMD="python" 18 | elif command -v python3 &> /dev/null; then 19 | PYTHON_CMD="python3" 20 | else 21 | echo "Python is not installed." 22 | exit 1 23 | fi 24 | } 25 | 26 | check_python 27 | 28 | if [ ! -d "$VENV_DIR" ]; then 29 | "$PYTHON_CMD" -m venv "$VENV_DIR" 30 | echo "Virtual environment '$VENV_DIR' created." 31 | # shellcheck disable=SC1091 32 | source "$VENV_DIR/bin/activate" 33 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 34 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 35 | else 36 | # shellcheck disable=SC1091 37 | source "$VENV_DIR/bin/activate" 38 | fi 39 | -------------------------------------------------------------------------------- /bench_candle/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: This script automates the setup of a virtual environment, 6 | # installs project requirements, converts model. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | if [ "$#" -ne 1 ]; then 12 | echo "Usage: $0 " 13 | exit 1 14 | fi 15 | 16 | # Define directory paths 17 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 18 | VENV_DIR="$SCRIPT_DIR/venv" 19 | MODELS_FOLDER="$1" 20 | LLAMA_HF_MODEL_DIR="$MODELS_FOLDER/llama-2-7b-hf" 21 | LLAMA_ST_MODEL_DIR="$MODELS_FOLDER/llama-2-7b-st" 22 | 23 | if [ ! -d "$VENV_DIR" ]; then 24 | python -m venv "$VENV_DIR" 25 | echo "Virtual environment '$VENV_DIR' created." 26 | # shellcheck disable=SC1091 27 | source "$VENV_DIR"/bin/activate 28 | pip install --upgrade pip > /dev/null 29 | pip install -r "$SCRIPT_DIR"/requirements.txt > /dev/null 30 | else 31 | # shellcheck disable=SC1091 32 | source "$VENV_DIR"/bin/activate 33 | fi 34 | 35 | if [ ! -d "$LLAMA_ST_MODEL_DIR" ]; then 36 | echo "Storing llama-2-7b-hf in safetensors format..." 37 | python "$SCRIPT_DIR"/convert_to_safetensors.py --input_dir "$LLAMA_HF_MODEL_DIR" --output_dir "$LLAMA_ST_MODEL_DIR" 38 | else 39 | echo "Model llama-2-7b-hf in safetensors format already exists!" 
40 | fi 41 | -------------------------------------------------------------------------------- /bench_optimum_nvidia/converter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | import torch 6 | from optimum.nvidia import AutoModelForCausalLM 7 | 8 | # Some points to note: 9 | # - the conversion is super simple, and it assumes batch size to be 1 and 10 | # num beams to be 1 11 | # - it also assumes a standard prompt length of 512 tokens 12 | 13 | 14 | def build_engine(hf_model_path: str, out_model_dir: str, torch_dtype: str): 15 | if not os.path.isdir(out_model_dir): 16 | os.makedirs(out_model_dir, exist_ok=True) 17 | 18 | dtype_mapper = {"float16": torch.float16, "float32": torch.float32} 19 | 20 | try: 21 | logging.info("Starting to build the model engine") 22 | model = AutoModelForCausalLM.from_pretrained( 23 | pretrained_model_name_or_path=hf_model_path, 24 | max_batch_size=1, 25 | max_prompt_length=512, 26 | num_beams=1, 27 | torch_dtype=dtype_mapper[torch_dtype], 28 | ) 29 | 30 | model.save_pretrained(save_directory=out_model_dir) 31 | except Exception as e: 32 | logging.info(f"Error: {e}") 33 | os.rmdir(out_model_dir) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser("HF Optimum builder engine CLI") 38 | parser.add_argument( 39 | "--hf_dir", 40 | type=str, 41 | help="Hugging Face model weights path", 42 | ) 43 | 44 | parser.add_argument( 45 | "--out_dir", 46 | type=str, 47 | help="The output engine dir", 48 | ) 49 | 50 | parser.add_argument( 51 | "--dtype", 52 | type=str, 53 | help="The precision in which it will be saved. Supported: 'float16' and 'float32'", 54 | ) 55 | 56 | args = parser.parse_args() 57 | build_engine( 58 | hf_model_path=args.hf_dir, out_model_dir=args.out_dir, torch_dtype=args.dtype 59 | ) 60 | -------------------------------------------------------------------------------- /bench_candle/README.md: -------------------------------------------------------------------------------- 1 | # Candle 2 | 3 | [![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/huggingface/candle) &nbsp; 4 | 5 | [Candle](https://github.com/huggingface/candle) is a minimalistic Machine/Deep Learning framework written in Rust by [huggingface](https://github.com/huggingface). It tries to provide a simpler interface to implement models along with GPU support. This is a modified implementation of the [Llama2-Candle example](https://github.com/huggingface/candle/blob/main/candle-examples/examples/llama/main.rs) to analyse the benchmark performance across different devices and precisions. 6 | 7 | 8 | ### 🚀 Running the Candle Benchmark. 9 | 10 | To run this benchmark, make sure you have [Rust installed](https://www.rust-lang.org/tools/install). You can run the Candle benchmark using the following command: 11 | 12 | ```bash 13 | ./bench_candle/bench.sh \ 14 | --prompt \ # Enter a prompt string 15 | --max_tokens \ # Maximum number of tokens to output 16 | --repetitions \ # Number of repetitions to be made for the prompt. 17 | --log_file \ # A .log file under which we want to write the results. 18 | --device \ # The device on which we want to benchmark.
19 | --models_dir # The directory in which model weights are present 20 | ``` 21 | 22 | To get started quickly, you can simply run: 23 | 24 | ```bash 25 | ./bench_candle/bench.sh -d cuda 26 | ``` 27 | This will use all the default values (see the [bench.sh](/bench_candle/bench.sh) file) and perform the benchmarks. You can find all the benchmark results for Candle [here](/docs/llama2.md). 28 | 29 | 30 | ### 👀 Some points to note: 31 | 32 | 1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b), so it assumes that you have already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and have been verified to download the weights. 33 | 2. Candle does not have support for Metal devices. 34 | 3. Candle does support [quantized models](https://github.com/huggingface/candle/blob/main/candle-examples/examples/quantized/main.rs). Benchmarks for quantized Candle models will be available in future versions. 35 | -------------------------------------------------------------------------------- /bench_optimum_nvidia/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 12 | 13 | CURRENT_DIR="$(pwd)" 14 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 15 | 16 | check_docker() { 17 | if command -v docker &> /dev/null; then 18 | return 0 19 | else 20 | return 1 21 | fi 22 | } 23 | 24 | 25 | build_docker_image () { 26 | # Check if the Docker image exists 27 | if docker image inspect huggingface/optimum-nvidia:latest &> /dev/null; then 28 | echo "Image 'huggingface/optimum-nvidia:latest' already exists." 29 | else 30 | docker pull huggingface/optimum-nvidia:latest 31 | fi 32 | } 33 | 34 | build_and_compile_model () { 35 | echo "Running and building the model inside Docker..." 36 | local MODEL_NAME="$1" 37 | local PRECISION="$2" 38 | 39 | # Set the default folder paths for HF and engines 40 | LLAMA2_WEIGHTS_FOLDER="/mnt/models/llama-2-7b-chat" 41 | MISTRAL_WEIGHTS_FOLDER="/mnt/models/mistral-7b-v0.1-instruct" 42 | 43 | if [ "$MODEL_NAME" = "llama" ]; then 44 | HF_DIR="$LLAMA2_WEIGHTS_FOLDER-hf" 45 | ENGINE_DIR="$LLAMA2_WEIGHTS_FOLDER-optimum-$PRECISION" 46 | OUT_DIR="$CURRENT_DIR/models/llama-2-7b-chat-optimum-$PRECISION" 47 | 48 | elif [ "$MODEL_NAME" = "mistral" ]; then 49 | HF_DIR="$MISTRAL_WEIGHTS_FOLDER-hf" 50 | ENGINE_DIR="$MISTRAL_WEIGHTS_FOLDER-optimum-$PRECISION" 51 | OUT_DIR="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-optimum-$PRECISION" 52 | else 53 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 54 | exit 1 55 | fi 56 | 57 | if [ !
-d "$OUT_DIR" ]; then 58 | docker run --gpus all \ 59 | --ipc=host \ 60 | --ulimit memlock=-1 \ 61 | --ulimit stack=67108864 \ 62 | -v "$CURRENT_DIR"/models:/mnt/models \ 63 | -v "$SCRIPT_DIR/converter.py":/mnt/converter.py \ 64 | huggingface/optimum-nvidia:latest \ 65 | python3 /mnt/converter.py --hf_dir "$HF_DIR" --out_dir "$ENGINE_DIR" --dtype "$PRECISION" 66 | else 67 | echo "Engine file already exists" 68 | fi 69 | 70 | } 71 | 72 | 73 | MODEL_NAME="${1:-"llama"}" 74 | 75 | if check_docker; then 76 | build_docker_image 77 | build_and_compile_model "$MODEL_NAME" "float32" 78 | build_and_compile_model "$MODEL_NAME" "float16" 79 | else 80 | echo "Docker is not installed or not in the PATH" 81 | fi 82 | -------------------------------------------------------------------------------- /common/memory_tracker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from contextlib import contextmanager 4 | from multiprocessing import Pipe, Process 5 | from multiprocessing.connection import Connection 6 | 7 | # Adapted from optimum-benchmark, I don't trust pytorch peak memory memory info when external libs are used. 8 | # source: https://github.com/huggingface/optimum/blob/main/tests/benchmark/benchmark_gptq.py 9 | 10 | 11 | class MemoryTracker: 12 | def __init__(self): 13 | self.peak_memory: int = 0 14 | self.device_index = int( 15 | os.environ.get("CUDA_VISIBLE_DEVICES", "0,").split(",")[0] 16 | ) 17 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 18 | 19 | @contextmanager 20 | def track(self, interval: float = 0.1): 21 | print(f"Tracking memory for device {self.device_index}") 22 | yield from self._track_peak_memory(interval) 23 | 24 | def _track_peak_memory(self, interval: float): 25 | child_connection, parent_connection = Pipe() 26 | # instantiate process 27 | mem_process: Process = PeakMemoryMeasureProcess( 28 | self.device_index, child_connection, interval 29 | ) 30 | mem_process.start() 31 | # wait until we get memory 32 | parent_connection.recv() 33 | yield 34 | # start parent connection 35 | parent_connection.send(0) 36 | # receive peak memory 37 | self.peak_memory = parent_connection.recv() 38 | 39 | 40 | class PeakMemoryMeasureProcess(Process): 41 | def __init__( 42 | self, device_index: int, child_connection: Connection, interval: float 43 | ): 44 | super().__init__() 45 | self.device_index = device_index 46 | self.interval = interval 47 | self.connection = child_connection 48 | self.mem_usage = 0 49 | 50 | def run(self): 51 | self.connection.send(0) 52 | stop = False 53 | 54 | command = ( 55 | f"nvidia-smi --query-gpu=memory.used --format=csv --id={self.device_index}" 56 | ) 57 | 58 | while True: 59 | # py3nvml is broken since it outputs only the reserved memory, and nvidia-smi has only the MiB precision. 
60 | gpu_mem_mb = ( 61 | subprocess.check_output(command.split()) 62 | .decode("ascii") 63 | .split("\n")[1] 64 | .split()[0] 65 | ) 66 | gpu_mem_mb = int(gpu_mem_mb) * 1.048576 67 | self.mem_usage = max(self.mem_usage, gpu_mem_mb) 68 | 69 | if stop: 70 | break 71 | stop = self.connection.poll(self.interval) 72 | 73 | # send results to parent pipe 74 | self.connection.send(self.mem_usage) 75 | self.connection.close() 76 | -------------------------------------------------------------------------------- /bench_llamacpp/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | check_python() { 12 | if command -v python &> /dev/null; then 13 | PYTHON_CMD="python" 14 | elif command -v python3 &> /dev/null; then 15 | PYTHON_CMD="python3" 16 | else 17 | echo "Python is not installed." 18 | exit 1 19 | fi 20 | } 21 | 22 | clone_and_build_llama() { 23 | local DEVICE="$1" 24 | local VENV_DIR="$2" 25 | local SCRIPT_DIR="$3" 26 | 27 | if [ "$#" -ne 3 ]; then 28 | echo "Usage: $0 " 29 | exit 1 30 | fi 31 | 32 | case "$DEVICE" in 33 | cuda) 34 | export LLAMA_CUBLAS=on 35 | ;; 36 | metal) 37 | export LLAMA_METAL=on 38 | ;; 39 | cpu) 40 | return 0 41 | ;; 42 | *) 43 | echo "Unsupported DEVICE: $DEVICE" 44 | return 1 45 | ;; 46 | esac 47 | 48 | local LIBLLAMA_FILE="$VENV_DIR/libllama_$DEVICE.so" 49 | 50 | if [ -e "$LIBLLAMA_FILE" ]; then 51 | echo "File $LIBLLAMA_FILE exists." 52 | exit 0 53 | fi 54 | 55 | # Remove existing llama.cpp directory if it exists 56 | if [ -d "$SCRIPT_DIR/llama.cpp" ]; then 57 | echo "Removing existing llama.cpp directory..." 58 | rm -rf "$SCRIPT_DIR"/llama.cpp 59 | fi 60 | 61 | git clone --depth=1 https://github.com/ggerganov/llama.cpp "$SCRIPT_DIR"/llama.cpp 62 | cd "$SCRIPT_DIR"/llama.cpp 63 | 64 | # Build llama.cpp 65 | make clean > /dev/null 66 | echo "Building llama.cpp..." 67 | make libllama.so > /dev/null 68 | cp libllama.so "$LIBLLAMA_FILE" 69 | cd "$SCRIPT_DIR" 70 | 71 | rm -rf "$SCRIPT_DIR"/llama.cpp 72 | } 73 | 74 | # CLI Args 75 | DEVICE="$1" 76 | 77 | # Define directory paths 78 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 79 | VENV_DIR="$SCRIPT_DIR/venv" 80 | LIBLLAMA_FILE="$VENV_DIR/libllama_$DEVICE.so" 81 | 82 | check_python 83 | 84 | if [ ! -d "$VENV_DIR" ]; then 85 | "$PYTHON_CMD" -m venv "$VENV_DIR" 86 | echo "Virtual environment '$VENV_DIR' created." 87 | # shellcheck disable=SC1091 88 | source "$VENV_DIR/bin/activate" 89 | pip install --upgrade pip > /dev/null 90 | pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 91 | else 92 | # shellcheck disable=SC1091 93 | source "$VENV_DIR/bin/activate" 94 | fi 95 | 96 | clone_and_build_llama "$DEVICE" "$VENV_DIR" "$SCRIPT_DIR" 97 | -------------------------------------------------------------------------------- /bench_lightning/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 
7 | ################################################################################ 8 | 9 | CURRENT_DIR="$(pwd)" 10 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 11 | VENV_DIR="$SCRIPT_DIR/venv" 12 | 13 | check_python() { 14 | if command -v python &> /dev/null; then 15 | PYTHON_CMD="python" 16 | elif command -v python3 &> /dev/null; then 17 | PYTHON_CMD="python3" 18 | else 19 | echo "Python is not installed." 20 | exit 1 21 | fi 22 | } 23 | 24 | 25 | setup_environment() { 26 | if [ ! -d "$VENV_DIR" ]; then 27 | "$PYTHON_CMD" -m venv "$VENV_DIR" 28 | echo "Virtual environment '$VENV_DIR' created." 29 | # shellcheck disable=SC1091 30 | source "$VENV_DIR/bin/activate" 31 | pip install --upgrade pip > /dev/null 32 | 33 | # install everything 34 | pip install 'litgpt[all] @ git+https://github.com/Lightning-AI/litgpt' 35 | pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 36 | echo "Successfully installed lit-gpt and it's dependencies" 37 | else 38 | # shellcheck disable=SC1091 39 | source "$VENV_DIR/bin/activate" 40 | fi 41 | } 42 | 43 | convert_hf_to_litgpt() { 44 | local MODEL_NAME="$1" 45 | 46 | # This trick is done because LitGPT expects specific folder name / checkpoint_dir 47 | # Llama-2-7b-chat-hf or Mistral-7B-Instruct-v0.1 48 | TEMP_DIR="" 49 | LITGPT_DIR="" 50 | BACK_TO_DIR="" 51 | 52 | if [ "$MODEL_NAME" = "llama" ]; then 53 | TEMP_DIR="$CURRENT_DIR/models/Llama-2-7b-chat-hf" 54 | LITGPT_DIR="$CURRENT_DIR/models/llama-2-7b-chat-litgpt" 55 | BACK_TO_DIR="$CURRENT_DIR/models/llama-2-7b-chat-hf" 56 | elif [ "$MODEL_NAME" = "mistral" ]; then 57 | TEMP_DIR="$CURRENT_DIR/models/Mistral-7B-Instruct-v0.1" 58 | LITGPT_DIR="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-litgpt" 59 | BACK_TO_DIR="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-hf" 60 | else 61 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 62 | exit 1 63 | fi 64 | 65 | if [ -d "$LITGPT_DIR" ]; then 66 | echo "Already converted" 67 | exit 0 68 | else 69 | mv "$BACK_TO_DIR" "$TEMP_DIR" 70 | mkdir -p "$LITGPT_DIR" 71 | litgpt convert to_litgpt --checkpoint_dir "$TEMP_DIR" 72 | mv "$TEMP_DIR/model_config.yaml" "$TEMP_DIR/lit_model.pth" "$LITGPT_DIR/" 73 | cp -r "$TEMP_DIR/tokenizer.model" "$TEMP_DIR/tokenizer_config.json" "$LITGPT_DIR/" 74 | mv "$TEMP_DIR" "$BACK_TO_DIR" 75 | fi 76 | } 77 | 78 | 79 | MODEL_NAME="$1" 80 | 81 | check_python 82 | setup_environment 83 | convert_hf_to_litgpt "$MODEL_NAME" 84 | -------------------------------------------------------------------------------- /bench_onnxruntime/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 12 | 13 | CURRENT_DIR="$(pwd)" 14 | 15 | check_docker() { 16 | if command -v docker &> /dev/null; then 17 | return 0 18 | else 19 | return 1 20 | fi 21 | } 22 | 23 | 24 | build_docker_image () { 25 | # Check if the Docker image exists 26 | if docker image inspect anindyadeep/onnxruntime:latest &> /dev/null; then 27 | echo "Image 'anindyadeep/onnxruntime:latest' already exists." 
28 | else 29 | docker pull anindyadeep/onnxruntime:latest 30 | fi 31 | } 32 | 33 | build_and_compile_model () { 34 | echo "Running and building the model inside Docker..." 35 | local MODEL_NAME="$1" 36 | local PRECISION="$2" 37 | local DEVICE="$3" 38 | 39 | # Set the default folder paths for HF and engines 40 | LLAMA2_WEIGHTS_FOLDER="/mnt/models/llama-2-7b-chat" 41 | MISTRAL_WEIGHTS_FOLDER="/mnt/models/mistral-7b-v0.1-instruct" 42 | 43 | if [ "$MODEL_NAME" = "llama" ]; then 44 | HF_DIR="$LLAMA2_WEIGHTS_FOLDER-hf" 45 | ENGINE_DIR="$LLAMA2_WEIGHTS_FOLDER-onnx-$PRECISION" 46 | OUT_DIR="$CURRENT_DIR/models/llama-2-7b-chat-onnx-$PRECISION" 47 | 48 | elif [ "$MODEL_NAME" = "mistral" ]; then 49 | HF_DIR="$MISTRAL_WEIGHTS_FOLDER-hf" 50 | ENGINE_DIR="$MISTRAL_WEIGHTS_FOLDER-onnx-$PRECISION" 51 | OUT_DIR="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-onnx-$PRECISION" 52 | else 53 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 54 | exit 1 55 | fi 56 | 57 | if [ "$PRECISION" = "float32" ]; then 58 | ONNX_PRECISION="fp32" 59 | elif [ "$PRECISION" = "float16" ]; then 60 | ONNX_PRECISION="fp16" 61 | else 62 | echo "Supported precision: 'float32' and 'float16'" 63 | exit 1 64 | fi 65 | 66 | if [ ! -d "$OUT_DIR" ]; then 67 | docker run --gpus all \ 68 | --ipc=host \ 69 | --ulimit memlock=-1 \ 70 | --ulimit stack=67108864 \ 71 | -v "$CURRENT_DIR"/models:/mnt/models \ 72 | anindyadeep/onnxruntime:latest \ 73 | optimum-cli export onnx --model "$HF_DIR" \ 74 | --task text-generation --framework pt \ 75 | --opset 17 --sequence_length 1024 \ 76 | --batch_size 1 --device "$DEVICE" \ 77 | --dtype "$ONNX_PRECISION" "$ENGINE_DIR" 78 | else 79 | echo "Engine file already exists" 80 | fi 81 | 82 | } 83 | 84 | 85 | MODEL_NAME="${1:-"llama"}" 86 | DEVICE="$2" 87 | 88 | if check_docker; then 89 | build_docker_image 90 | build_and_compile_model "$MODEL_NAME" "float32" "$DEVICE" 91 | build_and_compile_model "$MODEL_NAME" "float16" "$DEVICE" 92 | else 93 | echo "Docker is not installed or not in the PATH" 94 | fi 95 | -------------------------------------------------------------------------------- /benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | print_usage() { 5 | echo "Usage: $0 [OPTIONS]" 6 | echo "OPTIONS:" 7 | echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')" 8 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 9 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)" 10 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')" 11 | echo " -lf, --log_file Logging file name." 12 | echo " -md, --models_dir Models directory." 13 | echo " -h, --help Show this help message" 14 | exit 1 15 | } 16 | 17 | 18 | download_models() { 19 | echo -e "\nDownloading models..." 20 | bash ./download.sh --models models.json --cache cache.log 21 | } 22 | 23 | check_jq() { 24 | if ! command -v jq &> /dev/null 25 | then 26 | echo -e "\njq is not installed." 27 | exit 1 28 | fi 29 | } 30 | 31 | # Parse command-line arguments 32 | while [ "$#" -gt 0 ]; do 33 | case "$1" in 34 | -p|--prompt) 35 | PROMPT="$2" 36 | shift 2 37 | ;; 38 | -r|--repetitions) 39 | REPETITIONS="$2" 40 | shift 2 41 | ;; 42 | -m|--max_tokens) 43 | MAX_TOKENS="$2" 44 | shift 2 45 | ;; 46 | -d|--device) 47 | DEVICE="$2" 48 | case "$DEVICE" in 49 | "cuda" | "metal" | "cpu") 50 | ;; 51 | *) 52 | echo "Invalid value for --device. 
Please use 'cuda', 'gpu' or 'cpu'." 53 | print_usage 54 | ;; 55 | esac 56 | shift 2 57 | ;; 58 | -lf|--log_file) 59 | LOG_FILENAME="$2" 60 | shift 2 61 | ;; 62 | -md|--models_dir) 63 | MODELS_DIR="$2" 64 | shift 2 65 | ;; 66 | -h|--help) 67 | print_usage 68 | ;; 69 | *) 70 | echo "Unknown option: $1" 71 | print_usage 72 | ;; 73 | esac 74 | done 75 | 76 | check_jq 77 | download_models 78 | 79 | 80 | PROMPT="${PROMPT:-"Explain what is a transformer"}" 81 | REPETITIONS="${REPETITIONS:-10}" 82 | MAX_TOKENS="${MAX_TOKENS:-100}" 83 | DEVICE="${DEVICE:-'cpu'}" 84 | LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" 85 | MODELS_DIR="${MODELS_DIR:-"./models"}" 86 | 87 | folders=$(find . -type d -name "bench_*") 88 | 89 | for folder in $folders; do 90 | if [ -d "$folder" ]; then 91 | echo "Running benchmark $folder/bench.sh..." 92 | 93 | if ! bash "$folder/bench.sh" \ 94 | --prompt "$PROMPT" \ 95 | --repetitions "$REPETITIONS" \ 96 | --max_tokens "$MAX_TOKENS" \ 97 | --models_dir "$MODELS_DIR" \ 98 | --log_file "$LOG_FILENAME" \ 99 | --device "$DEVICE"; then 100 | echo "Error: Something went wrong in $folder/bench.sh" 101 | else 102 | echo "Success: $folder/bench.sh completed successfully" 103 | fi 104 | fi 105 | done 106 | -------------------------------------------------------------------------------- /bench_autoawq/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 12 | CURRENT_DIR="$(pwd)" 13 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 14 | VENV_DIR="$SCRIPT_DIR/venv" 15 | 16 | # Set default folder paths for AWQ weights 17 | LLAMA2_AWQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/llama-2-7b-chat-autoawq" 18 | MISTRAL_AWQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-autoawq" 19 | 20 | check_python() { 21 | if command -v python &> /dev/null; then 22 | PYTHON_CMD="python" 23 | elif command -v python3 &> /dev/null; then 24 | PYTHON_CMD="python3" 25 | else 26 | echo "Python is not installed." 27 | exit 1 28 | fi 29 | } 30 | 31 | download_awq_weights() { 32 | local MODEL_NAME="$1" 33 | 34 | # Set download directory based on MODEL_NAME 35 | if [ "$MODEL_NAME" = "llama" ]; then 36 | DOWNLOAD_DIR="$LLAMA2_AWQ_WEIGHTS_FOLDER" 37 | MODEL_IDENTIFIER="TheBloke/Llama-2-7B-Chat-AWQ" 38 | elif [ "$MODEL_NAME" = "mistral" ]; then 39 | DOWNLOAD_DIR="$MISTRAL_AWQ_WEIGHTS_FOLDER" 40 | MODEL_IDENTIFIER="TheBloke/Mistral-7B-Instruct-v0.1-AWQ" 41 | else 42 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 43 | exit 1 44 | fi 45 | 46 | # Check if weights folder exists 47 | echo "$DOWNLOAD_DIR" 48 | 49 | if [ ! -d "$DOWNLOAD_DIR" ]; then 50 | # Download weights using huggingface-cli 51 | echo "Downloading weights to $DOWNLOAD_DIR..." 52 | huggingface-cli download "$MODEL_IDENTIFIER" --local-dir "$DOWNLOAD_DIR" --exclude "*.git*" "*.md" "Notice" "LICENSE" 53 | else 54 | echo "Weights already downloaded" 55 | fi 56 | } 57 | 58 | check_python 59 | 60 | if [ ! -d "$VENV_DIR" ]; then 61 | "$PYTHON_CMD" -m venv "$VENV_DIR" 62 | echo "Virtual environment '$VENV_DIR' created." 
63 | 64 | # Activate virtual environment using specified activation scripts 65 | if [ -f "$VENV_DIR/bin/activate" ]; then 66 | # shellcheck disable=SC1091 67 | source "$VENV_DIR/bin/activate" 68 | elif [ -f "$VENV_DIR/Scripts/activate" ]; then 69 | # shellcheck disable=SC1091 70 | source "$VENV_DIR/Scripts/activate" 71 | else 72 | echo "Error: Unable to find virtual environment activation script." 73 | exit 1 74 | fi 75 | 76 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 77 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 78 | else 79 | # Activate virtual environment using specified activation scripts 80 | if [ -f "$VENV_DIR/bin/activate" ]; then 81 | # shellcheck disable=SC1091 82 | source "$VENV_DIR/bin/activate" 83 | elif [ -f "$VENV_DIR/Scripts/activate" ]; then 84 | # shellcheck disable=SC1091 85 | source "$VENV_DIR/Scripts/activate" 86 | else 87 | echo "Error: Unable to find virtual environment activation script." 88 | exit 1 89 | fi 90 | fi 91 | 92 | 93 | MODEL_NAME="${1:-"llama"}" # Use the first argument as MODEL_NAME if provided 94 | download_awq_weights "$MODEL_NAME" 95 | -------------------------------------------------------------------------------- /bench_exllamav2/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | # Define directory paths 10 | CURRENT_DIR="$(pwd)" 11 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | VENV_DIR="$SCRIPT_DIR/venv" 13 | 14 | # Make the default dirs 15 | LLAMA2_EXLLAMA_WEIGHTS_FOLDER="$CURRENT_DIR/models/llama-2-7b-chat-exllamav2" 16 | MISTRAL_EXLLAMA_WEIGHTS_FOLDER="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-exllamav2" 17 | 18 | check_python() { 19 | if command -v python &> /dev/null; then 20 | PYTHON_CMD="python" 21 | elif command -v python3 &> /dev/null; then 22 | PYTHON_CMD="python3" 23 | else 24 | echo "Python is not installed." 25 | exit 1 26 | fi 27 | } 28 | 29 | 30 | setup_exllamav2_and_quantize() { 31 | local MODEL_NAME="$1" 32 | local QUANTIZATION="$2" 33 | 34 | if [ "$MODEL_NAME" = "llama" ]; then 35 | EXLLAMA_WEIGHTS_FOLDER="$LLAMA2_EXLLAMA_WEIGHTS_FOLDER-$QUANTIZATION-bit" 36 | HF_WEIGHTS_FOLDER="$CURRENT_DIR/models/llama-2-7b-chat-hf" 37 | elif [ "$MODEL_NAME" = "mistral" ]; then 38 | EXLLAMA_WEIGHTS_FOLDER="$MISTRAL_EXLLAMA_WEIGHTS_FOLDER-$QUANTIZATION-bit" 39 | HF_WEIGHTS_FOLDER="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-hf" 40 | else 41 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 42 | exit 1 43 | fi 44 | 45 | # do the conversion if the ExLlamaV2 46 | if [ -d "$EXLLAMA_WEIGHTS_FOLDER" ] && [ "$(ls -A "$EXLLAMA_WEIGHTS_FOLDER")" ]; then 47 | echo "EXLLAMA_WEIGHTS_FOLDER already exists and is not empty." 48 | else 49 | # clone the repo, if not exists 50 | if [ -d "$SCRIPT_DIR/exllamav2" ]; then 51 | echo "exllamav2 folder already exists." 52 | else 53 | git clone https://github.com/turboderp/exllamav2.git "$SCRIPT_DIR/exllamav2" 54 | fi 55 | 56 | mkdir -p "$EXLLAMA_WEIGHTS_FOLDER" 57 | echo "Going for conversion to exllamav2 format from .safetensors in $QUANTIZATION bit quantization." 
58 | "$PYTHON_CMD" "$SCRIPT_DIR/exllamav2/convert.py" \ 59 | -i "$HF_WEIGHTS_FOLDER" \ 60 | -o "$EXLLAMA_WEIGHTS_FOLDER" \ 61 | -cf "$EXLLAMA_WEIGHTS_FOLDER" \ 62 | -b "$QUANTIZATION" 63 | 64 | # once done sync with other folders 65 | rm -rf "$EXLLAMA_WEIGHTS_FOLDER/out_tensor" 66 | rsync -av --exclude='*.safetensors' --exclude='.*' --exclude='*.bin' "$HF_WEIGHTS_FOLDER" "$EXLLAMA_WEIGHTS_FOLDER" 67 | fi 68 | 69 | # Delete ExllamaV2 repo 70 | rm -rf "$SCRIPT_DIR/exllamav2" 71 | } 72 | 73 | 74 | check_python 75 | 76 | # CLI Args 77 | MODEL_NAME="$1" 78 | 79 | if [ ! -d "$VENV_DIR" ]; then 80 | "$PYTHON_CMD" -m venv "$VENV_DIR" 81 | echo "Virtual environment '$VENV_DIR' created." 82 | # shellcheck disable=SC1091 83 | source "$VENV_DIR/bin/activate" 84 | pip install --upgrade pip > /dev/null 85 | pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 86 | else 87 | # shellcheck disable=SC1091 88 | source "$VENV_DIR/bin/activate" 89 | fi 90 | 91 | echo "Converting HuggingFace Llama2 model pytorch .bin file to .safetensors format" 92 | 93 | setup_exllamav2_and_quantize "$MODEL_NAME" 4.0 94 | setup_exllamav2_and_quantize "$MODEL_NAME" 8.0 95 | -------------------------------------------------------------------------------- /bench_deepspeed/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import mii 5 | from transformers import AutoTokenizer 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from common.base import BaseBenchmarkClass # noqa 10 | from common.utils import launch_cli, make_report # noqa 11 | 12 | 13 | class DeepSpeedBenchmark(BaseBenchmarkClass): 14 | def __init__( 15 | self, 16 | model_path: str, 17 | model_name: str, 18 | benchmark_name: str, 19 | precision: str, 20 | device: str, 21 | experiment_name: str, 22 | ) -> None: 23 | super().__init__( 24 | model_path=model_path, 25 | model_name=model_name, 26 | benchmark_name=benchmark_name, 27 | precision=precision, 28 | device=device, 29 | experiment_name=experiment_name, 30 | ) 31 | 32 | assert precision == "float16", ValueError( 33 | "Precision other than 'float16' is not supported in DeepSpeed" 34 | ) 35 | assert device == "cuda", ValueError( 36 | "Supported device is only cuda for DeepSpeed" 37 | ) 38 | 39 | def load_model_and_tokenizer(self): 40 | self.model = mii.pipeline(self.model_path) 41 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) 42 | return self 43 | 44 | def preprocess( 45 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 46 | ): 47 | if chat_mode: 48 | template = self.get_chat_template_with_instruction( 49 | prompt=prompt, for_benchmarks=for_benchmarks 50 | ) 51 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 52 | 53 | tokenized_input = self.tokenizer.encode(text=prompt) 54 | 55 | return { 56 | "prompt": prompt, 57 | "input_tokens": tokenized_input, 58 | "tensor": None, 59 | "num_input_tokens": len(tokenized_input), 60 | } 61 | 62 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 63 | prompt = inputs["prompt"] 64 | output = self.model( 65 | [prompt], max_new_tokens=max_tokens, temperature=temperature 66 | )[0].generated_text 67 | 68 | output_tokens = self.tokenizer.encode(text=output) 69 | return { 70 | "output_prompt": output, 71 | "output_tokens": output_tokens, 72 | "num_output_tokens": len(output_tokens), 73 | } 74 | 75 | def postprocess(self, output: dict) -> str: 76 | return output["output_prompt"] 77 | 78 | 79 | if __name__ 
== "__main__": 80 | parser = launch_cli(description="DeepSpeed Benchmark.") 81 | args = parser.parse_args() 82 | 83 | model_folder = os.path.join(os.getcwd(), "models") 84 | model_name = ( 85 | f"{args.model_name}-2-7b-chat-hf" 86 | if args.model_name == "llama" 87 | else f"{args.model_name}-7b-v0.1-instruct-hf" 88 | ) 89 | 90 | runner_dict = { 91 | "cuda": [ 92 | { 93 | "precision": "float16", 94 | "model_path": os.path.join(model_folder, model_name), 95 | } 96 | ] 97 | } 98 | 99 | make_report( 100 | args=args, 101 | benchmark_class=DeepSpeedBenchmark, 102 | runner_dict=runner_dict, 103 | benchmark_name="DeepSpeed", 104 | is_bench_pytorch=False, 105 | ) 106 | -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: download.sh 5 | # Description: Downloads files from a list of URLs specified in a JSON file. 6 | # The JSON file should contain an array of objects, each with a 'url', 'file', 7 | # and 'folder' property. The script checks if the file already exists before 8 | # downloading it. 9 | # 10 | # Usage: ./download.sh --models --cache --force-download 11 | # 12 | # Example: 13 | # ./download.sh --models models.json --cache cache.log --force-download 14 | ################################################################################ 15 | 16 | set -euo pipefail 17 | 18 | # Default values 19 | models_file="$(pwd)/models.json" 20 | cache_file="$(pwd)/cache.log" 21 | force_download=false 22 | 23 | # Function to download a file 24 | download_file() { 25 | local url=$1 26 | local file=$2 27 | local dir=$3 28 | 29 | # Create the directory if it does not exist 30 | mkdir -p "$dir" 31 | 32 | # Download the file 33 | wget -N "$url" -O "$dir/$file" 34 | echo "$url" >> "$cache_file" 35 | } 36 | 37 | # Function to unzip a file 38 | unzip_file() { 39 | local file=$1 40 | local dir=$2 41 | 42 | # Unzip the file 43 | unzip -o "$file" -d "$dir" 44 | 45 | # Move the unzipped files to the parent directory 46 | find "$dir" -mindepth 2 -type f -exec mv {} "$dir" \; 47 | } 48 | 49 | # Function to remove a file 50 | remove_file() { 51 | local file=$1 52 | 53 | # Remove the file 54 | rm "$file" 55 | } 56 | 57 | # Argument parsing 58 | while [[ $# -gt 0 ]]; do 59 | key="$1" 60 | 61 | case $key in 62 | --models) 63 | models_file="$2" 64 | shift # past argument 65 | shift # past value 66 | ;; 67 | --cache) 68 | cache_file="$2" 69 | shift # past argument 70 | shift # past value 71 | ;; 72 | --force-download) 73 | force_download=true 74 | shift # past argument 75 | ;; 76 | *) 77 | echo "Unknown option: $1" 78 | exit 1 79 | ;; 80 | esac 81 | done 82 | 83 | # Check if the required arguments are provided 84 | if [ -z "$models_file" ] || [ -z "$cache_file" ]; then 85 | echo "Usage: $0 --models --cache [--force-download]" 86 | exit 1 87 | fi 88 | 89 | # Check if the JSON file exists 90 | if [ ! -f "$models_file" ]; then 91 | echo "Error: JSON file '$models_file' does not exist." 92 | exit 1 93 | fi 94 | 95 | # Check if force download is enabled 96 | if $force_download; then 97 | echo "Force download enabled. Removing all files in the models folder and cache file." 
98 | rm -rf ./models/* 99 | rm "$cache_file" 100 | fi 101 | 102 | # Read the JSON file 103 | json=$(cat "$models_file") 104 | 105 | # Parse the JSON file and iterate over its elements 106 | echo "$json" | jq -r '.[] | @base64' | while read -r i; do 107 | _jq() { 108 | echo "${i}" | base64 --decode | jq -r "${1}" 109 | } 110 | 111 | url=$(_jq '.url') 112 | file=$(_jq '.file') 113 | folder=$(_jq '.folder') 114 | 115 | # Check if the URL is in the log file 116 | if ! grep -q "$url" "$cache_file"; then 117 | if [[ $file == *.zip ]]; then 118 | echo "Downloading and unzipping: $url to $folder" 119 | download_file "$url" "$file" "$folder" 120 | unzip_file "$folder/$file" "$folder" 121 | echo "Removing: $folder/$file" 122 | remove_file "$folder/$file" 123 | else 124 | echo "Downloading: $url to $folder/$file" 125 | download_file "$url" "$file" "$folder" 126 | fi 127 | fi 128 | done 129 | -------------------------------------------------------------------------------- /bench_ctranslate/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: This script automates the setup of a virtual environment, 6 | # installs project requirements, converts model. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | CURRENT_DIR="$(pwd)" 12 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 13 | VENV_DIR="$SCRIPT_DIR/venv" 14 | 15 | check_python() { 16 | if command -v python &> /dev/null; then 17 | PYTHON_CMD="python" 18 | elif command -v python3 &> /dev/null; then 19 | PYTHON_CMD="python3" 20 | else 21 | echo "Python is not installed." 22 | exit 1 23 | fi 24 | } 25 | 26 | 27 | build_and_compile_model () { 28 | local MODEL_NAME="$1" 29 | local PRECISION="$2" 30 | 31 | valid_precisions=("float32" "float16" "int8") 32 | 33 | # shellcheck disable=SC2199 34 | # shellcheck disable=SC2076 35 | if [[ ! " ${valid_precisions[@]} " =~ " $PRECISION " ]]; then 36 | echo "Invalid PRECISION value. Supported values are ${valid_precisions[*]}." 37 | exit 1 38 | fi 39 | 40 | if [[ "$MODEL_NAME" == "llama" ]]; then 41 | local model_download_path="$CURRENT_DIR/models/llama-2-7b-chat-ctranslate2-$PRECISION" 42 | local model_to_convert="$CURRENT_DIR/models/llama-2-7b-chat-hf" 43 | 44 | elif [[ "$MODEL_NAME" == "mistral" ]]; then 45 | local model_download_path="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-ctranslate2-$PRECISION" 46 | local model_to_convert="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-hf" 47 | else 48 | echo "No such model is supported" 49 | exit 1 50 | fi 51 | 52 | 53 | if [ ! -d "$model_download_path" ]; then 54 | ct2-transformers-converter --model "$model_to_convert" --quantization "$PRECISION" --output_dir "$model_download_path" --copy_files tokenizer.model tokenizer_config.json tokenizer.json special_tokens_map.json --force 55 | echo "Model Build for model: $MODEL_NAME and precision: $PRECISION ran successfully" 56 | else 57 | echo "Download folder already exists" 58 | fi 59 | 60 | } 61 | 62 | 63 | build_and_compile_models() { 64 | local MODEL_NAME="$1" 65 | local PRECISIONS=("float32" "float16" "int8") 66 | 67 | for PRECISION in "${PRECISIONS[@]}"; do 68 | build_and_compile_model "$MODEL_NAME" "$PRECISION" 69 | done 70 | } 71 | 72 | 73 | MODEL_NAME="${1:-"llama"}" 74 | 75 | check_python 76 | 77 | if [ ! 
-d "$VENV_DIR" ]; then 78 | "$PYTHON_CMD" -m venv "$VENV_DIR" 79 | echo "Virtual environment '$VENV_DIR' created." 80 | 81 | # Activate virtual environment using specified activation scripts 82 | if [ -f "$VENV_DIR/bin/activate" ]; then 83 | # shellcheck disable=SC1091 84 | source "$VENV_DIR/bin/activate" 85 | else 86 | echo "Error: Unable to find virtual environment activation script." 87 | exit 1 88 | fi 89 | 90 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 91 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 92 | else 93 | # Activate virtual environment using specified activation scripts 94 | if [ -f "$VENV_DIR/bin/activate" ]; then 95 | # shellcheck disable=SC1091 96 | source "$VENV_DIR/bin/activate" 97 | else 98 | echo "Error: Unable to find virtual environment activation script." 99 | exit 1 100 | fi 101 | fi 102 | 103 | 104 | build_and_compile_models "$MODEL_NAME" 105 | -------------------------------------------------------------------------------- /bench_autogptq/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 12 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 13 | VENV_DIR="$SCRIPT_DIR/venv" 14 | CURRENT_DIR="$(pwd)" 15 | 16 | # Set default folder paths for GPTQ weights 17 | LLAMA2_GPTQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/llama-2-7b-chat-autogptq" 18 | MISTRAL_GPTQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-autogptq" 19 | 20 | 21 | check_python() { 22 | if command -v python &> /dev/null; then 23 | PYTHON_CMD="python" 24 | elif command -v python3 &> /dev/null; then 25 | PYTHON_CMD="python3" 26 | else 27 | echo "Python is not installed." 28 | exit 1 29 | fi 30 | } 31 | 32 | download_gptq_weights() { 33 | local MODEL_NAME="$1" 34 | 35 | # Set download directory based on MODEL_NAME 36 | if [ "$MODEL_NAME" = "llama" ]; then 37 | DOWNLOAD_DIR="$LLAMA2_GPTQ_WEIGHTS_FOLDER" 38 | MODEL_IDENTIFIER="TheBloke/Llama-2-7B-Chat-GPTQ" 39 | elif [ "$MODEL_NAME" = "mistral" ]; then 40 | DOWNLOAD_DIR="$MISTRAL_GPTQ_WEIGHTS_FOLDER" 41 | MODEL_IDENTIFIER="TheBloke/Mistral-7B-Instruct-v0.1-GPTQ" 42 | else 43 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 44 | exit 1 45 | fi 46 | 47 | # Check if weights folder exists 48 | echo "$DOWNLOAD_DIR" 49 | 50 | if [ ! -d "$DOWNLOAD_DIR" ]; then 51 | # Download weights using huggingface-cli 52 | echo "Downloading weights to $DOWNLOAD_DIR..." 53 | huggingface-cli download "$MODEL_IDENTIFIER" --local-dir "$DOWNLOAD_DIR" --exclude "*.git*" "*.md" "Notice" "LICENSE" 54 | else 55 | echo "Weights already downloaded" 56 | fi 57 | } 58 | 59 | install_autogptq() { 60 | if [ -d "$SCRIPT_DIR/AutoGPTQ" ]; then 61 | echo "Removing existing AutoGPTQ directory..." 62 | rm -rf "$SCRIPT_DIR"/AutoGPTQ 63 | fi 64 | 65 | git clone https://github.com/PanQiWei/AutoGPTQ.git "$SCRIPT_DIR"/AutoGPTQ 66 | cd "$SCRIPT_DIR"/AutoGPTQ 67 | 68 | # Now build 69 | 70 | "$PYTHON_CMD" setup.py install 71 | 72 | # come out of the dir 73 | cd "$SCRIPT_DIR" 74 | } 75 | 76 | check_python 77 | 78 | if [ ! 
-d "$VENV_DIR" ]; then 79 | "$PYTHON_CMD" -m venv "$VENV_DIR" 80 | echo "Virtual environment '$VENV_DIR' created." 81 | 82 | if [ -f "$VENV_DIR/bin/activate" ]; then 83 | # shellcheck disable=SC1091 84 | source "$VENV_DIR/bin/activate" 85 | else 86 | echo "Error: Unable to find virtual environment activation script." 87 | exit 1 88 | fi 89 | 90 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 91 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 92 | 93 | "$PYTHON_CMD" -m pip uninstall -y fsspec 94 | 95 | # Install the required version of fsspec 96 | "$PYTHON_CMD" -m pip install 'fsspec[http]>=2023.1.0,<=2024.2.0' 97 | 98 | install_autogptq 99 | else 100 | if [ -f "$VENV_DIR/bin/activate" ]; then 101 | # shellcheck disable=SC1091 102 | source "$VENV_DIR/bin/activate" 103 | else 104 | echo "Error: Unable to find virtual environment activation script." 105 | exit 1 106 | fi 107 | fi 108 | 109 | 110 | MODEL_NAME="${1:-"llama"}" # Use the first argument as MODEL_NAME if provided 111 | download_gptq_weights "$MODEL_NAME" 112 | -------------------------------------------------------------------------------- /bench_lightning/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | 5 | from transformers import AutoTokenizer 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from bench_lightning.inference import generate, load_model # noqa 10 | from common.base import BaseBenchmarkClass # noqa 11 | from common.utils import launch_cli, make_report # noqa 12 | 13 | 14 | class PyTorchLightningBenchmark(BaseBenchmarkClass): 15 | def __init__( 16 | self, 17 | model_path: str, 18 | model_name: str, 19 | benchmark_name: str, 20 | precision: str, 21 | device: str, 22 | experiment_name: str, 23 | ) -> None: 24 | super().__init__( 25 | model_name=model_name, 26 | model_path=model_path, 27 | benchmark_name=benchmark_name, 28 | experiment_name=experiment_name, 29 | precision=precision, 30 | device=device, 31 | ) 32 | 33 | self.quantization_precision_mapping = { 34 | "float16": {"precision": "16-true", "quantize": None}, 35 | "float32": {"precision": "32-true", "quantize": None}, 36 | "int8": {"precision": "16-true", "quantize": "bnb.int8"}, 37 | } 38 | 39 | if model_name == "llama": 40 | self.tokenizer_folder = os.path.join( 41 | os.getcwd(), "models", "llama-2-7b-chat-hf" 42 | ) 43 | else: 44 | self.tokenizer_folder = os.path.join( 45 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 46 | ) 47 | 48 | def load_model_and_tokenizer(self): 49 | self.model, self.lit_tokenizer, self.prompt_style, self.fabric = load_model( 50 | checkpoint_dir=self.model_path, 51 | quantize=self.quantization_precision_mapping[self.precision]["quantize"], 52 | precision=self.quantization_precision_mapping[self.precision]["precision"], 53 | ) 54 | 55 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 56 | return self 57 | 58 | def preprocess( 59 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 60 | ): 61 | return {"prompt": prompt} 62 | 63 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 64 | prompt = inputs["prompt"] 65 | output = generate( 66 | model=self.model, 67 | tokenizer=self.lit_tokenizer, 68 | prompt_style=self.prompt_style, 69 | fabric=self.fabric, 70 | prompt=prompt, 71 | max_new_tokens=max_tokens, 72 | temperature=temperature, 73 | ) 74 | 75 | output_prompt = self.tokenizer.decode( 76 | 
output["output_tokens"], skip_special_tokens=True 77 | ) 78 | return {**output, "output_prompt": output_prompt} 79 | 80 | def postprocess(self, output: dict) -> str: 81 | return output["output_prompt"] 82 | 83 | 84 | if __name__ == "__main__": 85 | parser = launch_cli(description="PyTorch Lightning") 86 | args = parser.parse_args() 87 | 88 | model_folder = os.path.join(os.getcwd(), "models") 89 | model_name = ( 90 | f"{args.model_name}-2-7b-chat-litgpt" 91 | if args.model_name == "llama" 92 | else f"{args.model_name}-7b-v0.1-instruct-litgpt" 93 | ) 94 | 95 | model_path = Path(os.path.join(model_folder, model_name)) 96 | 97 | runner_dict = { 98 | "cuda": [ 99 | {"precision": "float16", "model_path": model_path}, 100 | {"precision": "float32", "model_path": model_path}, 101 | {"precision": "int8", "model_path": model_path}, 102 | ] 103 | } 104 | 105 | make_report( 106 | args=args, 107 | benchmark_class=PyTorchLightningBenchmark, 108 | runner_dict=runner_dict, 109 | benchmark_name="PyTorch Lightning", 110 | is_bench_pytorch=False, 111 | ) 112 | -------------------------------------------------------------------------------- /questions.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "prompt": "I'm making pancakes for breakfast. I added a cup of flour, a teaspoon of salt, and a few tablespoons of sugar to a bowl. I stirred it together, then added a cup of milk, a beaten egg, and a few tablespoons of oil, and stirred until just mixed. Then I put 1/4 a cup on a hot frying pan, and flipped it when brown. But they're terrible! Why? List the main reason. Answer as much precise as possible with one sentence.", 4 | "max_tokens": 120, 5 | "temperature": 0.1, 6 | "expected": { 7 | "llama": " The main reason your pancakes are terrible is because you did not mix the batter long enough, resulting in a dense and flat pancake.", 8 | "mistral": "he main reason for the terrible pancakes could be that the batter was too thick, which resulted in a dry and tough texture." 9 | } 10 | }, 11 | { 12 | "prompt": "42 birds are sitting on a tree branch. A hunter passes, shoots one dead, and misses two. How many birds are left on the branch? Answer as much precise as possible with one sentence.", 13 | "max_tokens": 120, 14 | "temperature": 0.1, 15 | "expected": { 16 | "llama": " After the hunter shoots and kills one bird, 41 birds remain on the branch.", 17 | "mistral": "One bird is left on the branch." 18 | } 19 | }, 20 | { 21 | "prompt": "How many mistakes can you find in the following sentence? Sentence: Its to much to loose if your talking about hundred’s of dollars. Answer as much precise as possible with one sentence.", 22 | "max_tokens": 120, 23 | "temperature": 0.1, 24 | "expected": { 25 | "llama": " There are 2 mistakes in the sentence: \"to loose\" should be \"to lose\" and \"hundred's\" should be \"hundreds\".", 26 | "mistral": "There are two mistakes in the sentence: \"Its\" should be \"It's\" and \"to much\" should be \"too much\"." 27 | } 28 | }, 29 | { 30 | "prompt": "You are an expert on agricultural chemical compounds. Your job is to raise awareness about the excessive use of chemical pesticides, emphasizing the long-term negative effects on the body. Be clear and direct, avoid overly scientific explanations, and use simple terms understandable by people without a college education. 
Answer as much precise as possible with one sentence.", 31 | "max_tokens": 120, 32 | "temperature": 0.1, 33 | "expected": { 34 | "llama": " Excessive use of chemical pesticides can have long-term negative effects on the body, including cancer, hormone disruption, and neurological problems, due to their toxic and persistent nature, which can accumulate in the body over time and cause irreversible damage. It is important to adopt sustainable and organic farming practices that prioritize the health of both people and the environment, such as crop rotation, composting, and integrated pest management, to minimize the use of harmful chemicals and promote a healthier", 35 | "mistral": "Excessive use of chemical pesticides can lead to long-term negative health effects, including cancer, neurological damage, and reproductive issues." 36 | } 37 | }, 38 | { 39 | "prompt": "During the Manhattan Project, John von Neumann suggested how to maximize the destruction caused by a detonating bomb. Which was his idea? Answer as much precise as possible with one sentence.", 40 | "max_tokens": 120, 41 | "temperature": 0.1, 42 | "expected": { 43 | "llama": " John von Neumann proposed the concept of a \"fusion bomb,\" which would achieve maximum destruction through the simultaneous detonation of a fission bomb and a fusion reaction, creating a chain reaction of nuclear reactions that would release an enormous amount of energy. This idea was later developed into the hydrogen bomb.", 44 | "mistral": "John von Neumann proposed the concept of a \"superbomb\" that would use multiple nuclear explosions to create a massive explosion, causing widespread destruction." 45 | } 46 | } 47 | ] 48 | -------------------------------------------------------------------------------- /bench_llamacpp/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from llama_cpp import Llama 5 | from transformers import AutoTokenizer 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from common.base import BaseBenchmarkClass # noqa 10 | from common.utils import launch_cli, make_report # noqa 11 | 12 | 13 | class LlamaCPPBenchmark(BaseBenchmarkClass): 14 | def __init__( 15 | self, 16 | model_path: str, 17 | model_name: str, 18 | benchmark_name: str, 19 | precision: str, 20 | device: str, 21 | experiment_name: str, 22 | ) -> None: 23 | assert precision in ["int8", "int4"], ValueError( 24 | "Precision should set either 'int8' or 'int4'" 25 | ) 26 | super().__init__( 27 | model_name=model_name, 28 | model_path=model_path, 29 | benchmark_name=benchmark_name, 30 | experiment_name=experiment_name, 31 | precision=precision, 32 | device=device, 33 | ) 34 | 35 | if model_name == "llama": 36 | self.tokenizer_folder = os.path.join( 37 | os.getcwd(), "models", "llama-2-7b-chat-hf" 38 | ) 39 | else: 40 | self.tokenizer_folder = os.path.join( 41 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 42 | ) 43 | 44 | def load_model_and_tokenizer(self): 45 | self.model = Llama( 46 | model_path=self.model_path, 47 | n_gpu_layers=0 if self.device == "cpu" else -1, 48 | verbose=True, 49 | ) 50 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 51 | return self 52 | 53 | def preprocess( 54 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 55 | ): 56 | if chat_mode: 57 | template = self.get_chat_template_with_instruction( 58 | prompt=prompt, for_benchmarks=for_benchmarks 59 | ) 60 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 61 | 
62 | tokenized_input = self.tokenizer.encode(text=prompt) 63 | return { 64 | "prompt": prompt, 65 | "input_tokens": tokenized_input, 66 | "tensor": None, 67 | "num_input_tokens": len(tokenized_input), 68 | } 69 | 70 | def run_model( 71 | self, inputs: dict, max_tokens: int, temperature: float = 0.1 72 | ) -> dict: 73 | prompt = inputs["prompt"] 74 | output = self.model.create_completion( 75 | prompt, max_tokens=max_tokens, temperature=temperature 76 | ) 77 | 78 | output_prompt = output["choices"][0]["text"] 79 | num_tokens = output["usage"]["completion_tokens"] 80 | return {"output_prompt": output_prompt, "num_output_tokens": num_tokens} 81 | 82 | def postprocess(self, output: dict) -> str: 83 | return output["output_prompt"] 84 | 85 | 86 | if __name__ == "__main__": 87 | parser = launch_cli(description="LlamaCPP Benchmark.") 88 | args = parser.parse_args() 89 | 90 | model_folder = os.path.join(os.getcwd(), "models") 91 | model_name = ( 92 | f"{args.model_name}-2-7b-chat-gguf/llama-2-7b-chat." 93 | if args.model_name == "llama" 94 | else f"{args.model_name}-7b-v0.1-instruct-gguf/mistral-7b-instruct-v0.1." 95 | ) 96 | 97 | runner_dict = { 98 | "cuda": [ 99 | { 100 | "precision": "int4", 101 | "model_path": os.path.join(model_folder, model_name + "Q4_K_M.gguf"), 102 | }, 103 | { 104 | "precision": "int8", 105 | "model_path": os.path.join(model_folder, model_name + "Q8_0.gguf"), 106 | }, 107 | ] 108 | } 109 | 110 | make_report( 111 | args=args, 112 | benchmark_class=LlamaCPPBenchmark, 113 | runner_dict=runner_dict, 114 | benchmark_name="LlamaCPP", 115 | is_bench_pytorch=False, 116 | ) 117 | -------------------------------------------------------------------------------- /docs/archive.md: -------------------------------------------------------------------------------- 1 | # ⚙️ Benchmarking ML Engines 2 | 3 | This file contains archived benchmark numbers for different engines and precisions. Many of the models and engines have been upgraded since these runs were recorded, so the 4 | results below are kept for reference only. The latest implementation does not include benchmarks for Metal or Mac CPU, so if you need those numbers, you can find them here.
5 | 6 | ## A100 80GB Inference Bench: 7 | 8 | **Environment:** 9 | - Model: LLAMA-2-7B 10 | - CUDA Version: 11.7 11 | - Command: `./benchmark.sh --repetitions 10 --max_tokens 512 --device cuda --prompt 'Write an essay about the transformer model architecture'` 12 | 13 | **Performance Metrics:** (unit: Tokens / second) 14 | 15 | | Engine | float32 | float16 | int8 | int4 | 16 | | ------------------------------------------ | ------------- | ------------- | ------------- | -------------- | 17 | | [candle](/bench_candle/) | - | 36.78 ± 2.17 | - | - | 18 | | [llama.cpp](/bench_llamacpp/) | - | - | 79.15 ± 1.20 | 100.90 ± 1.46 | 19 | | [ctranslate](/bench_ctranslate/) | 35.23 ± 4.01 | 55.72 ± 16.66 | 35.73 ± 10.87 | - | 20 | | [onnx](/bench_onnxruntime/) | - | 54.16 ± 3.15 | - | - | 21 | | [transformers (pytorch)](/bench_pytorch/) | 43.79 ± 0.61 | 46.39 ± 0.28 | 6.98 ± 0.05 | 21.72 ± 0.11 | 22 | | [vllm](/bench_vllm/) | 90.78 ± 1.60 | 90.54 ± 2.22 | - | 114.69 ± 11.20 | 23 | | [exllamav2](/bench_exllamav2/) | - | - | 121.63 ± 0.74 | 130.16 ± 0.35 | 24 | | [ctransformers](/bench_ctransformers/) | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 | 25 | | [AutoGPTQ](/bench_autogptq/) | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - | 26 | | [AutoAWQ](/bench_autoawq/) | - | - | - | 109.20 ± 3.28 | 27 | | [DeepSpeed](/bench_deepspeed/) | - | 81.44 ± 8.13 | - | | 28 | | [PyTorch Lightning](/bench_lightning/) | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | 29 | | [Optimum Nvidia](/bench_optimum_nvidia/) | 110.36 ± 0.52 | 109.09 ± 4.26 | - | - | 30 | | [Nvidia TensorRT-LLM](/bench_tensorrtllm/) | 55.19 ± 1.03 | 85.03 ± 0.62 | 167.66 ± 2.05 | 235.18 ± 3.20 | 31 | 32 | *(Data updated: `05th April 2024`) 33 | 34 | 35 | ## M2 MAX 32GB Inference Bench: 36 | 37 | ### CPU 38 | 39 | **Environment:** 40 | - Model: LLAMA-2-7B 41 | - CUDA Version: NA 42 | - Command: `./benchmark.sh --repetitions 10 --max_tokens 512 --device cpu --prompt 'Write an essay about the transformer model architecture'` 43 | 44 | **Performance Metrics:** (unit: Tokens / second) 45 | | Engine | float32 | float16 | int8 | int4 | 46 | | -------------------------------------- | ------- | ----------- | ------------ | ------------ | 47 | | [candle](/bench_candle/) | - | 3.43 ± 0.02 | - | - | 48 | | [llama.cpp](/bench_llamacpp/) | - | - | 13.24 ± 0.62 | 21.43 ± 0.47 | 49 | | [ctranslate](/bench_ctranslate/) | - | - | 1.87 ± 0.14 | - | 50 | | [ctransformers](/bench_ctransformers/) | - | - | 13.50 ± 0.48 | 20.57 ± 2.50 | 51 | 52 | 53 | ### GPU (Metal) 54 | 55 | **Command:** `./benchmark.sh --repetitions 10 --max_tokens 512 --device metal --prompt 'Write an essay about the transformer model architecture'` 56 | 57 | **Performance Metrics:** (unit: Tokens / second) 58 | | Engine | float32 | float16 | int8 | int4 | 59 | | -------------------------------------- | ------- | ------- | ------------ | ------------ | 60 | | [llama.cpp](/bench_llamacpp/) | - | - | 30.11 ± 0.45 | 44.27 ± 0.12 | 61 | | [ctransformers](/bench_ctransformers/) | - | - | 20.75 ± 0.36 | 34.04 ± 2.11 | 62 | 63 | *(Data updated: `05th April 2024`) 64 | -------------------------------------------------------------------------------- /bench_ctranslate/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import ctranslate2 5 | from transformers import AutoTokenizer 6 | 7 | # have to hard code this thing 8 | sys.path.append(os.getcwd()) 9 | 10 | from common.base import BaseBenchmarkClass # noqa 11 | from common.utils import 
launch_cli, make_report # noqa 12 | 13 | 14 | class CTranslateBenchmark(BaseBenchmarkClass): 15 | def __init__( 16 | self, 17 | model_path: str, 18 | model_name: str, 19 | benchmark_name: str, 20 | precision: str, 21 | device: str, 22 | experiment_name: str, 23 | ) -> None: 24 | assert precision in ["float32", "float16", "int8"], ValueError( 25 | "Precisions other than 'float32', 'float16' and 'int8' are not supported" 26 | ) 27 | super().__init__( 28 | model_path=model_path, 29 | model_name=model_name, 30 | benchmark_name=benchmark_name, 31 | precision=precision, 32 | device=device, 33 | experiment_name=experiment_name, 34 | ) 35 | 36 | def load_model_and_tokenizer(self): 37 | self.model = ctranslate2.Generator(self.model_path, device=self.device) 38 | 39 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) 40 | return self 41 | 42 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 43 | if chat_mode: 44 | template = self.get_chat_template_with_instruction( 45 | prompt=prompt, for_benchmarks=for_benchmarks 46 | ) 47 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 48 | 49 | tokenized_input = self.tokenizer.convert_ids_to_tokens( 50 | self.tokenizer.encode(prompt) 51 | ) 52 | return { 53 | "prompt": prompt, 54 | "input_tokens": tokenized_input, 55 | "tensor": None, 56 | "num_input_tokens": len(tokenized_input), 57 | } 58 | 59 | def run_model( 60 | self, inputs: dict, max_tokens: int, temperature: float = 0.1 61 | ) -> dict: 62 | tokenized_input = inputs["input_tokens"] 63 | num_input_tokens = inputs["num_input_tokens"] - 1 64 | 65 | output = self.model.generate_batch( 66 | [tokenized_input], max_length=max_tokens, sampling_temperature=temperature 67 | ) 68 | 69 | output_tokens = output[0].sequences_ids[0][num_input_tokens:] 70 | output_prompt = self.tokenizer.decode(output_tokens, skip_special_tokens=True) 71 | return { 72 | "output_prompt": output_prompt, 73 | "output_tokens": output_tokens, 74 | "num_output_tokens": len(output_tokens), 75 | } 76 | 77 | def postprocess(self, output: dict) -> str: 78 | return output["output_prompt"] 79 | 80 | 81 | if __name__ == "__main__": 82 | parser = launch_cli(description="CTranslate2 Benchmark.") 83 | args = parser.parse_args() 84 | 85 | model_folder = os.path.join(os.getcwd(), "models") 86 | model_name = ( 87 | f"{args.model_name}-2-7b-chat-ctranslate2-" 88 | if args.model_name == "llama" 89 | else f"{args.model_name}-7b-v0.1-instruct-ctranslate2-" 90 | ) 91 | 92 | runner_dict = { 93 | "cuda": [ 94 | { 95 | "precision": "float32", 96 | "model_path": os.path.join(model_folder, model_name + "float32"), 97 | }, 98 | { 99 | "precision": "float16", 100 | "model_path": os.path.join(model_folder, model_name + "float16"), 101 | }, 102 | { 103 | "precision": "int8", 104 | "model_path": os.path.join(model_folder, model_name + "int8"), 105 | }, 106 | ] 107 | } 108 | 109 | make_report( 110 | args=args, 111 | benchmark_class=CTranslateBenchmark, 112 | runner_dict=runner_dict, 113 | benchmark_name="CTranslate2", 114 | is_bench_pytorch=False, 115 | ) 116 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/
20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # don't check-in sub folder 163 | models/* 164 | !models/.gitkeep 165 | 166 | # Repositories 167 | bench_tinygrad/tinygrad 168 | bench_burn/llama2-burn 169 | bench_exllamav2/exllamav2 170 | bench_exllamav2/wikitext-test.parquet 171 | bench_lightning/lit-gpt 172 | -------------------------------------------------------------------------------- /bench_autoawq/bench.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | import torch 6 | from awq import AutoAWQForCausalLM 7 | from transformers import AutoTokenizer 8 | 9 | sys.path.append(os.getcwd()) 10 | 11 | from common.base import BaseBenchmarkClass # noqa 12 | from common.utils import launch_cli, make_report # noqa 13 | 14 | 15 | class AutoAWQBenchmark(BaseBenchmarkClass): 16 | def __init__( 17 | self, 18 | model_path: str, 19 | model_name: str, 20 | benchmark_name: str, 21 | precision: str, 22 | device: str, 23 | experiment_name: str, 24 | ) -> None: 25 | super().__init__( 26 | model_name=model_name, 27 | model_path=model_path, 28 | benchmark_name=benchmark_name, 29 | experiment_name=experiment_name, 30 | precision=precision, 31 | device=device, 32 | ) 33 | 34 | # Have to do this step 35 | # since tokenizer in autoawq is not the instruction tuned one for the instruction tuned model 36 | 37 | if model_name == "llama": 38 | self.tokenizer_folder = os.path.join( 39 | os.getcwd(), "models", "llama-2-7b-chat-hf" 40 | ) 41 | else: 42 | self.tokenizer_folder = os.path.join( 43 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 44 | ) 45 | 46 | def load_model_and_tokenizer(self): 47 | self.model = AutoAWQForCausalLM.from_quantized( 48 | self.model_path, fuse_layers=True, safetensors=True, strict=False 49 | ) 50 | 51 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 52 | return self 53 | 54 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 55 | if chat_mode: 56 | template = self.get_chat_template_with_instruction( 57 | prompt=prompt, for_benchmarks=for_benchmarks 58 | ) 59 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 60 | 61 | tokenized_input = self.tokenizer.encode(text=prompt) 62 | tensor = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device) 63 | return { 64 | "prompt": prompt, 65 | "input_tokens": tokenized_input, 66 | "tensor": tensor, 67 | "num_input_tokens": len(tokenized_input), 68 | } 69 | 70 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 71 | tensor = inputs["tensor"] 72 | num_input_tokens = inputs["num_input_tokens"] 73 | 74 | output = ( 75 | self.model.generate( 76 | input_ids=tensor, 77 | max_new_tokens=max_tokens, 78 | temperature=temperature, 79 | do_sample=True, 80 | ) 81 | .detach() 82 | .tolist()[0] 83 | ) 84 | 85 | output_tokens = ( 86 | output[num_input_tokens:] if len(output) > num_input_tokens else output 87 | ) 88 | 89 | return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)} 90 | 91 | def postprocess(self, output: dict) -> str: 92 | output_tokens = output["output_tokens"] 93 | return self.tokenizer.decode(output_tokens, skip_special_tokens=True) 94 | 95 | def on_exit(self): 96 | if self.device == "cuda:0": 97 | del self.model 98 | torch.cuda.synchronize() 99 | else: 100 | del self.model 101 | 102 | 103 | if __name__ == "__main__": 104 | 
parser = launch_cli(description="AWQ Benchmark.") 105 | args = parser.parse_args() 106 | 107 | model_folder = os.path.join(os.getcwd(), "models") 108 | model_name = ( 109 | f"{args.model_name}-2-7b-chat-autoawq" 110 | if args.model_name == "llama" 111 | else f"{args.model_name}-7b-v0.1-instruct-autoawq" 112 | ) 113 | 114 | runner_dict = { 115 | "cuda": [ 116 | {"precision": "int4", "model_path": os.path.join(model_folder, model_name)} 117 | ] 118 | } 119 | 120 | if args.device == "cpu": 121 | logging.info("Skipping running model on int4 on CPU, not implemented for Half") 122 | pass 123 | else: 124 | make_report( 125 | args=args, 126 | benchmark_class=AutoAWQBenchmark, 127 | runner_dict=runner_dict, 128 | benchmark_name="AutoAWQ", 129 | is_bench_pytorch=False, 130 | ) 131 | -------------------------------------------------------------------------------- /bench_candle/convert_to_safetensors.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import shutil 5 | from collections import defaultdict 6 | 7 | import torch 8 | from safetensors.torch import load_file, save_file 9 | 10 | 11 | def setup_logging(): 12 | logging.basicConfig( 13 | level=logging.INFO, 14 | format="%(asctime)s [%(levelname)s]: %(message)s", 15 | handlers=[logging.StreamHandler()], 16 | ) 17 | 18 | 19 | def shared_pointers(tensors): 20 | ptrs = defaultdict(list) 21 | for k, v in tensors.items(): 22 | ptrs[v.data_ptr()].append(k) 23 | failing = [] 24 | for _, names in ptrs.items(): 25 | if len(names) > 1: 26 | failing.append(names) 27 | return failing 28 | 29 | 30 | def check_file_size(sf_filename: str, pt_filename: str): 31 | sf_size = os.stat(sf_filename).st_size 32 | pt_size = os.stat(pt_filename).st_size 33 | 34 | if (sf_size - pt_size) / pt_size > 0.01: 35 | raise RuntimeError( 36 | f"The file size different is more than 1%:\n - {sf_filename}: {sf_size}\n - {pt_filename}: {pt_size}" 37 | ) 38 | 39 | 40 | def rename(pt_filename: str) -> str: 41 | filename, _ = os.path.splitext(pt_filename) 42 | local = f"{filename}.safetensors" 43 | local = local.replace("pytorch_model", "model") 44 | return local 45 | 46 | 47 | def copy_file(src: str, dest: str): 48 | try: 49 | shutil.copy(src, dest) 50 | logging.info(f"Copying {src} to {dest}") 51 | except FileNotFoundError: 52 | logging.warning(f"{src} not found. 
Skipping copy.") 53 | 54 | 55 | def convert_file(pt_filename: str, sf_filename: str): 56 | loaded = torch.load(pt_filename, map_location="cpu") 57 | if "state_dict" in loaded: 58 | loaded = loaded["state_dict"] 59 | shared = shared_pointers(loaded) 60 | for shared_weights in shared: 61 | for name in shared_weights[1:]: 62 | loaded.pop(name) 63 | 64 | # For tensors to be contiguous 65 | loaded = {k: v.contiguous() for k, v in loaded.items()} 66 | 67 | # Adjust sf_filename to ensure correct formatting 68 | sf_filename = os.path.join( 69 | os.path.dirname(sf_filename), os.path.basename(rename(pt_filename)) 70 | ) 71 | 72 | save_file(loaded, sf_filename, metadata={"format": "pt"}) 73 | check_file_size(sf_filename, pt_filename) 74 | reloaded = load_file(sf_filename) 75 | for k in loaded: 76 | pt_tensor = loaded[k] 77 | sf_tensor = reloaded[k] 78 | if not torch.equal(pt_tensor, sf_tensor): 79 | raise RuntimeError(f"The output tensors do not match for key {k}") 80 | 81 | 82 | def convert_multi(input_dir: str, output_dir: str) -> list[str]: 83 | if os.path.exists(output_dir): 84 | logging.warning(f"{output_dir} already exists!") 85 | return [] 86 | else: 87 | os.mkdir(output_dir) 88 | 89 | config_src = os.path.join(input_dir, "config.json") 90 | tokenizer_src = os.path.join(input_dir, "tokenizer.json") 91 | 92 | if not os.path.exists(config_src) or not os.path.exists(tokenizer_src): 93 | logging.warning(f"{config_src} or {tokenizer_src} not found. Skipping copy.") 94 | return [] 95 | else: 96 | copy_file(config_src, output_dir) 97 | copy_file(tokenizer_src, output_dir) 98 | 99 | filenames = [file for file in os.listdir(input_dir) if file.endswith(".bin")] 100 | 101 | local_filenames = [] 102 | for filename in filenames: 103 | pt_filename = os.path.join(input_dir, filename) 104 | 105 | sf_filename = rename(pt_filename) 106 | sf_filename = os.path.join(output_dir, os.path.basename(sf_filename)) 107 | 108 | logging.info(f"Converting {pt_filename} to {sf_filename}") 109 | convert_file(pt_filename, sf_filename) 110 | local_filenames.append(sf_filename) 111 | 112 | return local_filenames 113 | 114 | 115 | if __name__ == "__main__": 116 | setup_logging() 117 | 118 | parser = argparse.ArgumentParser(description="Convert .bin files to .safetensors") 119 | parser.add_argument( 120 | "--input_dir", 121 | type=str, 122 | help="Path to the input directory containing .bin files", 123 | ) 124 | parser.add_argument( 125 | "--output_dir", 126 | type=str, 127 | help="Path to the output directory for .safetensors files", 128 | ) 129 | args = parser.parse_args() 130 | 131 | output_filenames = convert_multi(args.input_dir, args.output_dir) 132 | 133 | logging.info("Conversion successful. 
Output files:") 134 | for filename in output_filenames: 135 | logging.info(filename) 136 | -------------------------------------------------------------------------------- /bench_ctransformers/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from ctransformers import AutoModelForCausalLM 6 | from transformers import AutoTokenizer 7 | 8 | sys.path.append(os.getcwd()) 9 | 10 | from common.base import BaseBenchmarkClass # noqa 11 | from common.utils import launch_cli, make_report # noqa 12 | 13 | 14 | class CTransformersBenchmark(BaseBenchmarkClass): 15 | def __init__( 16 | self, 17 | model_path: str, 18 | model_name: str, 19 | benchmark_name: str, 20 | precision: str, 21 | device: str, 22 | experiment_name: str, 23 | ) -> None: 24 | super().__init__( 25 | model_path=model_path, 26 | model_name=model_name, 27 | benchmark_name=benchmark_name, 28 | precision=precision, 29 | device=device, 30 | experiment_name=experiment_name, 31 | ) 32 | 33 | if model_name == "llama": 34 | self.tokenizer_folder = os.path.join( 35 | os.getcwd(), "models", "llama-2-7b-chat-hf" 36 | ) 37 | else: 38 | self.tokenizer_folder = os.path.join( 39 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 40 | ) 41 | 42 | def load_model_and_tokenizer(self): 43 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 44 | 45 | model_file_mapping = { 46 | "llama": { 47 | "int4": "llama-2-7b-chat.Q4_K_M.gguf", 48 | "int8": "llama-2-7b-chat.Q8_0.gguf", 49 | }, 50 | "mistral": { 51 | "int4": "mistral-7b-instruct-v0.1.Q4_K_M.gguf", 52 | "int8": "mistral-7b-instruct-v0.1.Q8_0.gguf", 53 | }, 54 | } 55 | 56 | self.model = AutoModelForCausalLM.from_pretrained( 57 | self.model_path, 58 | model_file=model_file_mapping[self.model_name][self.precision], 59 | model_type=self.model_name, 60 | gpu_layers=50 if self.device in ["cuda", "metal"] else 0, 61 | # context_length=1024 (This exceeds the memory without changing the quality) 62 | ) 63 | return self 64 | 65 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 66 | if chat_mode: 67 | template = self.get_chat_template_with_instruction( 68 | prompt=prompt, for_benchmarks=for_benchmarks 69 | ) 70 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 71 | 72 | tokenized_input = self.tokenizer.encode(text=prompt) 73 | return { 74 | "prompt": prompt, 75 | "input_tokens": tokenized_input, 76 | "tensor": None, 77 | "num_input_tokens": len(tokenized_input), 78 | } 79 | 80 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 81 | prompt = inputs["prompt"] 82 | output = self.model( 83 | prompt, stream=False, max_new_tokens=max_tokens, temperature=temperature 84 | ) 85 | generated_tokens = self.tokenizer.encode(output) 86 | 87 | # Note: CTransformers produces tokens after the input tokens 88 | return { 89 | "output_prompt": output, 90 | "output_tokens": generated_tokens, 91 | "num_output_tokens": len(generated_tokens), 92 | } 93 | 94 | def postprocess(self, output: dict) -> str: 95 | output_tokens = output["output_tokens"] 96 | return self.tokenizer.decode(output_tokens, skip_special_tokens=True) 97 | 98 | def on_exit(self): 99 | if self.device in ["cuda:0", "cuda"]: 100 | del self.model 101 | torch.cuda.synchronize() 102 | else: 103 | del self.model 104 | 105 | 106 | if __name__ == "__main__": 107 | parser = launch_cli(description="CTransformers Benchmark.") 108 | args = parser.parse_args() 109 | 110 | model_folder 
= os.path.join(os.getcwd(), "models") 111 | model_name = ( 112 | f"{args.model_name}-2-7b-chat-gguf" 113 | if args.model_name == "llama" 114 | else f"{args.model_name}-7b-v0.1-instruct-gguf" 115 | ) 116 | 117 | runner_dict = { 118 | "cuda": [ 119 | {"precision": "int4", "model_path": os.path.join(model_folder, model_name)}, 120 | {"precision": "int8", "model_path": os.path.join(model_folder, model_name)}, 121 | ] 122 | } 123 | 124 | make_report( 125 | args=args, 126 | benchmark_class=CTransformersBenchmark, 127 | runner_dict=runner_dict, 128 | benchmark_name="CTransformers", 129 | is_bench_pytorch=False, 130 | ) 131 | -------------------------------------------------------------------------------- /bench_optimum_nvidia/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from optimum.nvidia import AutoModelForCausalLM 6 | from transformers import AutoTokenizer 7 | 8 | sys.path.append("/mnt") 9 | sys.path.append("/mnt/benchmarks/") 10 | 11 | from common.base import BaseBenchmarkClass # noqa 12 | from common.utils import launch_cli, make_report # noqa 13 | 14 | 15 | class OptimumBenchmark(BaseBenchmarkClass): 16 | def __init__( 17 | self, 18 | model_path: str, 19 | model_name: str, 20 | benchmark_name: str, 21 | precision: str, 22 | device: str, 23 | experiment_name: str, 24 | ) -> None: 25 | assert precision in ["float32", "float16"], ValueError( 26 | "Supported precision: 'float32' and 'float16'" 27 | ) 28 | super().__init__( 29 | model_name=model_name, 30 | model_path=model_path, 31 | benchmark_name=benchmark_name, 32 | experiment_name=experiment_name, 33 | precision=precision, 34 | device=device, 35 | root_folder="/mnt/benchmarks", 36 | ) 37 | 38 | if model_name == "llama": 39 | self.tokenizer_folder = os.path.join( 40 | self.root_folder, "models", "llama-2-7b-chat-hf" 41 | ) 42 | else: 43 | self.tokenizer_folder = os.path.join( 44 | self.root_folder, "models", "mistral-7b-v0.1-instruct-hf" 45 | ) 46 | 47 | def load_model_and_tokenizer(self): 48 | dtype_mapper = {"float16": torch.float16, "float32": torch.float32} 49 | self.model = AutoModelForCausalLM.from_pretrained( 50 | pretrained_model_name_or_path=self.model_path, 51 | torch_dtype=dtype_mapper[self.precision], 52 | ) 53 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 54 | return self 55 | 56 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 57 | if chat_mode: 58 | template = self.get_chat_template_with_instruction( 59 | prompt=prompt, for_benchmarks=for_benchmarks 60 | ) 61 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 62 | 63 | tokenized_input = self.tokenizer.encode(text=prompt) 64 | tensor = self.tokenizer(prompt, return_tensors="pt") 65 | return { 66 | "prompt": prompt, 67 | "input_tokens": tokenized_input, 68 | "tensor": tensor, 69 | "num_input_tokens": len(tokenized_input), 70 | } 71 | 72 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 73 | tensor = inputs["tensor"] 74 | num_input_tokens = inputs["num_input_tokens"] 75 | 76 | generated, _ = self.model.generate( 77 | **tensor, 78 | top_k=40, 79 | top_p=0.1, 80 | pad_token_id=self.tokenizer.eos_token_id, 81 | eos_token_id=self.tokenizer.eos_token_id, 82 | temperature=temperature, 83 | max_new_tokens=max_tokens, 84 | ) 85 | 86 | output_tokens = generated[0].detach().tolist()[num_input_tokens:] 87 | return {"output_tokens": output_tokens, "num_output_tokens": 
len(output_tokens)} 88 | 89 | def postprocess(self, output: dict) -> str: 90 | output_tokens = output["output_tokens"] 91 | output_text = self.tokenizer.decode(output_tokens, skip_special_tokens=True) 92 | return output_text 93 | 94 | def on_exit(self): 95 | if self.device == "cuda:0": 96 | del self.model 97 | torch.cuda.synchronize() 98 | else: 99 | del self.model 100 | 101 | 102 | if __name__ == "__main__": 103 | parser = launch_cli(description="HF-Optimum Nvidia Benchmark.") 104 | args = parser.parse_args() 105 | 106 | model_folder = "/mnt/benchmarks/models" 107 | model_name = ( 108 | f"{args.model_name}-2-7b-chat-optimum" 109 | if args.model_name == "llama" 110 | else f"{args.model_name}-7b-v0.1-instruct-optimum" 111 | ) 112 | 113 | runner_dict = { 114 | "cuda": [ 115 | { 116 | "precision": "float32", 117 | "model_path": os.path.join(model_folder, model_name + "-float32"), 118 | }, 119 | { 120 | "precision": "float16", 121 | "model_path": os.path.join(model_folder, model_name + "-float16"), 122 | }, 123 | ] 124 | } 125 | 126 | make_report( 127 | args=args, 128 | benchmark_class=OptimumBenchmark, 129 | runner_dict=runner_dict, 130 | benchmark_name="HF-Optimum Nvidia", 131 | is_bench_pytorch=False, 132 | ) 133 | -------------------------------------------------------------------------------- /bench_optimum_nvidia/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks HF Optimum Nvidia benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | CURRENT_DIR="$(pwd)" 34 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 35 | 36 | check_cuda() { 37 | if command -v nvcc &> /dev/null 38 | then 39 | echo -e "\nUsing CUDA" 40 | nvcc --version 41 | else 42 | echo -e "\nCUDA is not available." 
43 | exit 1 44 | fi 45 | } 46 | 47 | check_platform() { 48 | local platform 49 | platform=$(uname -s) 50 | if [[ "$platform" == "Linux" ]]; then 51 | echo "Running on Linux." 52 | elif [[ "$platform" == "Darwin" ]]; then 53 | echo "Running on Mac OS." 54 | else 55 | echo "Unknown platform." 56 | exit 1 57 | fi 58 | } 59 | 60 | setup() { 61 | local MODEL_NAME="${1:-llama}" 62 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 63 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 64 | } 65 | 66 | # Parse command-line arguments 67 | while [ "$#" -gt 0 ]; do 68 | case "$1" in 69 | -p|--prompt) 70 | PROMPT="$2" 71 | shift 2 72 | ;; 73 | -r|--repetitions) 74 | REPETITIONS="$2" 75 | shift 2 76 | ;; 77 | -m|--max_tokens) 78 | MAX_TOKENS="$2" 79 | shift 2 80 | ;; 81 | -d|--device) 82 | DEVICE="$2" 83 | case "$DEVICE" in 84 | "cuda" | "metal" | "cpu") 85 | ;; 86 | *) 87 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 88 | print_usage 89 | ;; 90 | esac 91 | if [ "$DEVICE" == "cuda" ]; then 92 | check_cuda 93 | else 94 | echo "Not supported for $DEVICE" 95 | exit 1 96 | fi 97 | shift 2 98 | ;; 99 | -n|--model_name) 100 | MODEL_NAME="$2" 101 | shift 2 102 | ;; 103 | -h|--help) 104 | print_usage 105 | ;; 106 | *) 107 | echo "Unknown option: $1" 108 | print_usage 109 | ;; 110 | esac 111 | done 112 | 113 | check_platform 114 | setup "$MODEL_NAME" 115 | 116 | # Set default values if not provided 117 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 118 | REPETITIONS="${REPETITIONS:-10}" 119 | MAX_TOKENS="${MAX_TOKENS:-512}" 120 | DEVICE="${DEVICE:-'cuda'}" 121 | MODEL_NAME="${MODEL_NAME:-"llama"}" 122 | 123 | 124 | docker run \ 125 | --gpus all \ 126 | --ipc=host \ 127 | --ulimit memlock=-1 \ 128 | --ulimit stack=67108864 \ 129 | -e PYTHONUNBUFFERED=1 \ 130 | -v "$CURRENT_DIR:/mnt/benchmarks" \ 131 | -it huggingface/optimum-nvidia:latest \ 132 | python3 -u "/mnt/benchmarks/bench_optimum_nvidia/bench.py" \ 133 | --prompt "$PROMPT" \ 134 | --repetitions "$REPETITIONS" \ 135 | --max_tokens "$MAX_TOKENS" \ 136 | --model_name "$MODEL_NAME" \ 137 | --device "$DEVICE" 138 | -------------------------------------------------------------------------------- /bench_tensorrtllm/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks Nvidia TensorRT LLM benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 
15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | CURRENT_DIR="$(pwd)" 34 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 35 | 36 | check_cuda() { 37 | if command -v nvcc &> /dev/null 38 | then 39 | echo -e "\nUsing CUDA" 40 | nvcc --version 41 | else 42 | echo -e "\nCUDA is not available." 43 | exit 1 44 | fi 45 | } 46 | 47 | check_platform() { 48 | local platform 49 | platform=$(uname -s) 50 | if [[ "$platform" == "Linux" ]]; then 51 | echo "Running on Linux." 52 | elif [[ "$platform" == "Darwin" ]]; then 53 | echo "Running on Mac OS." 54 | else 55 | echo "Unknown platform." 56 | exit 1 57 | fi 58 | } 59 | 60 | setup() { 61 | local MODEL_NAME="${1:-llama}" 62 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 63 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 64 | } 65 | 66 | 67 | # Parse command-line arguments 68 | while [ "$#" -gt 0 ]; do 69 | case "$1" in 70 | -p|--prompt) 71 | PROMPT="$2" 72 | shift 2 73 | ;; 74 | -r|--repetitions) 75 | REPETITIONS="$2" 76 | shift 2 77 | ;; 78 | -m|--max_tokens) 79 | MAX_TOKENS="$2" 80 | shift 2 81 | ;; 82 | -d|--device) 83 | DEVICE="$2" 84 | case "$DEVICE" in 85 | "cuda" | "metal" | "cpu") 86 | ;; 87 | *) 88 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
89 | print_usage 90 | ;; 91 | esac 92 | if [ "$DEVICE" == "cuda" ]; then 93 | check_cuda 94 | else 95 | echo "Not supported for $DEVICE" 96 | exit 1 97 | fi 98 | shift 2 99 | ;; 100 | -n|--model_name) 101 | MODEL_NAME="$2" 102 | shift 2 103 | ;; 104 | -h|--help) 105 | print_usage 106 | ;; 107 | *) 108 | echo "Unknown option: $1" 109 | print_usage 110 | ;; 111 | esac 112 | done 113 | 114 | check_platform 115 | 116 | setup "${MODEL_NAME:-llama}" 117 | 118 | # Set default values if not provided 119 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 120 | REPETITIONS="${REPETITIONS:-10}" 121 | MAX_TOKENS="${MAX_TOKENS:-512}" 122 | DEVICE="${DEVICE:-cuda}" 123 | MODEL_NAME="${MODEL_NAME:-"llama"}" 124 | 125 | 126 | docker run \ 127 | --gpus all \ 128 | --ipc=host \ 129 | --ulimit memlock=-1 \ 130 | --ulimit stack=67108864 \ 131 | -e PYTHONUNBUFFERED=1 \ 132 | -v "$CURRENT_DIR:/mnt/benchmarks" \ 133 | -it tensorrt_llm/release:latest \ 134 | python3 -u "/mnt/benchmarks/bench_tensorrtllm/bench.py" \ 135 | --prompt "$PROMPT" \ 136 | --repetitions "$REPETITIONS" \ 137 | --max_tokens "$MAX_TOKENS" \ 138 | --model_name "$MODEL_NAME" \ 139 | --device "$DEVICE" 140 | -------------------------------------------------------------------------------- /bench_onnxruntime/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs the HF Optimum ONNX benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | CURRENT_DIR="$(pwd)" 34 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 35 | 36 | 37 | check_cuda() { 38 | if command -v nvcc &> /dev/null 39 | then 40 | echo -e "\nUsing CUDA" 41 | nvcc --version 42 | else 43 | echo -e "\nCUDA is not available."
44 | exit 1 45 | fi 46 | } 47 | 48 | check_platform() { 49 | local platform 50 | platform=$(uname -s) 51 | if [[ "$platform" == "Linux" ]]; then 52 | echo "Running on Linux." 53 | elif [[ "$platform" == "Darwin" ]]; then 54 | echo "Running on Mac OS." 55 | else 56 | echo "Unknown platform." 57 | exit 1 58 | fi 59 | } 60 | 61 | setup() { 62 | local MODEL_NAME="${1:-llama}" 63 | local DEVICE="$2" 64 | 65 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 66 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" "$DEVICE" 67 | } 68 | 69 | # Parse command-line arguments 70 | while [ "$#" -gt 0 ]; do 71 | case "$1" in 72 | -p|--prompt) 73 | PROMPT="$2" 74 | shift 2 75 | ;; 76 | -r|--repetitions) 77 | REPETITIONS="$2" 78 | shift 2 79 | ;; 80 | -m|--max_tokens) 81 | MAX_TOKENS="$2" 82 | shift 2 83 | ;; 84 | -d|--device) 85 | DEVICE="$2" 86 | case "$DEVICE" in 87 | "cuda" | "metal" | "cpu") 88 | ;; 89 | *) 90 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 91 | print_usage 92 | ;; 93 | esac 94 | if [ "$DEVICE" == "cuda" ]; then 95 | check_cuda 96 | else 97 | echo "Not supported for $DEVICE" 98 | exit 1 99 | fi 100 | shift 2 101 | ;; 102 | -n|--model_name) 103 | MODEL_NAME="$2" 104 | shift 2 105 | ;; 106 | -h|--help) 107 | print_usage 108 | ;; 109 | *) 110 | echo "Unknown option: $1" 111 | print_usage 112 | ;; 113 | esac 114 | done 115 | 116 | check_platform 117 | 118 | setup "${MODEL_NAME:-llama}" "${DEVICE:-cuda}" 119 | 120 | # Set default values if not provided 121 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 122 | REPETITIONS="${REPETITIONS:-10}" 123 | MAX_TOKENS="${MAX_TOKENS:-512}" 124 | DEVICE="${DEVICE:-cuda}" 125 | MODEL_NAME="${MODEL_NAME:-"llama"}" 126 | 127 | docker run \ 128 | --gpus all \ 129 | --ipc=host \ 130 | --ulimit memlock=-1 \ 131 | --ulimit stack=67108864 \ 132 | -e PYTHONUNBUFFERED=1 \ 133 | -v "$CURRENT_DIR:/mnt/benchmarks" \ 134 | -it anindyadeep/onnxruntime:latest \ 135 | python3 -u "/mnt/benchmarks/bench_onnxruntime/bench.py" \ 136 | --prompt "$PROMPT" \ 137 | --repetitions "$REPETITIONS" \ 138 | --max_tokens "$MAX_TOKENS" \ 139 | --model_name "$MODEL_NAME" \ 140 | --device "$DEVICE" 141 | -------------------------------------------------------------------------------- /bench_vllm/bench.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import os 3 | import sys 4 | 5 | import torch 6 | from transformers import AutoTokenizer 7 | from vllm import LLM, SamplingParams 8 | from vllm.model_executor.parallel_utils import parallel_state 9 | 10 | sys.path.append(os.getcwd()) 11 | 12 | from common.base import BaseBenchmarkClass # noqa 13 | from common.utils import launch_cli, make_report # noqa 14 | 15 | 16 | class VLLMBenchmark(BaseBenchmarkClass): 17 | def __init__( 18 | self, 19 | model_path: str, 20 | model_name: str, 21 | benchmark_name: str, 22 | precision: str, 23 | device: str, 24 | experiment_name: str, 25 | ) -> None: 26 | assert device == "cuda", ValueError("Only supported device is 'cuda'") 27 | assert precision in ["float16", "float32", "int4"], ValueError( 28 | "supported precisions are: 'float16', 'float32' and 'int4'" 29 | ) 30 | 31 | super().__init__( 32 | model_name=model_name, 33 | model_path=model_path, 34 | benchmark_name=benchmark_name, 35 | experiment_name=experiment_name, 36 | precision=precision, 37 | device=device, 38 | ) 39 | 40 | if model_name == "llama": 41 | self.tokenizer_folder = os.path.join( 42 | os.getcwd(), "models", "llama-2-7b-chat-hf" 43 | )
44 | else: 45 | self.tokenizer_folder = os.path.join( 46 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 47 | ) 48 | 49 | def load_model_and_tokenizer(self): 50 | if self.precision == "int4": 51 | self.model = LLM( 52 | model=self.model_path, quantization="AWQ", tensor_parallel_size=1 53 | ) 54 | else: 55 | self.model = LLM(model=self.model_path) 56 | self.model.dtype = self.precision 57 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 58 | return self 59 | 60 | def preprocess( 61 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 62 | ): 63 | if chat_mode: 64 | template = self.get_chat_template_with_instruction( 65 | prompt=prompt, for_benchmarks=for_benchmarks 66 | ) 67 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 68 | 69 | tokenized_input = self.tokenizer.encode(text=prompt) 70 | return { 71 | "prompt": prompt, 72 | "input_tokens": tokenized_input, 73 | "tensor": None, 74 | "num_input_tokens": len(tokenized_input), 75 | } 76 | 77 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 78 | prompt = [inputs["prompt"]] 79 | 80 | sampling_params = SamplingParams(max_tokens=max_tokens, temperature=temperature) 81 | output = self.model.generate(prompt, sampling_params) 82 | 83 | generated_text = output[0].outputs[0].text 84 | generated_tokens = output[0].outputs[0].token_ids 85 | 86 | return { 87 | "output_tokens": generated_tokens, 88 | "num_output_tokens": len(generated_tokens), 89 | "output_prompt": generated_text, 90 | } 91 | 92 | def postprocess(self, output: dict) -> str: 93 | return output["output_prompt"] 94 | 95 | def on_exit(self): 96 | if self.device == "cuda": 97 | parallel_state.destroy_model_parallel() 98 | del self.model 99 | gc.collect() 100 | torch.cuda.empty_cache() 101 | torch.distributed.destroy_process_group() 102 | torch.cuda.synchronize() 103 | else: 104 | del self.model 105 | 106 | 107 | if __name__ == "__main__": 108 | parser = launch_cli(description="vLLM Benchmark.") 109 | args = parser.parse_args() 110 | 111 | model_folder = os.path.join(os.getcwd(), "models") 112 | model_name = ( 113 | f"{args.model_name}-2-7b-chat-" 114 | if args.model_name == "llama" 115 | else f"{args.model_name}-7b-v0.1-instruct-" 116 | ) 117 | 118 | runner_dict = { 119 | "cuda": [ 120 | { 121 | "precision": "float32", 122 | "model_path": os.path.join(model_folder, model_name + "hf"), 123 | }, 124 | { 125 | "precision": "float16", 126 | "model_path": os.path.join(model_folder, model_name + "hf"), 127 | }, 128 | { 129 | "precision": "int4", 130 | "model_path": os.path.join(model_folder, model_name + "autoawq"), 131 | }, 132 | ] 133 | } 134 | 135 | make_report( 136 | args=args, 137 | benchmark_class=VLLMBenchmark, 138 | runner_dict=runner_dict, 139 | benchmark_name="vLLM", 140 | is_bench_pytorch=False, 141 | ) 142 | -------------------------------------------------------------------------------- /bench_pytorch/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks AutoAWQ llama benchmark. 
6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_platform() { 47 | local platform 48 | platform=$(uname -s) 49 | if [[ "$platform" == "Linux" ]]; then 50 | echo "Running on Linux." 51 | elif [[ "$platform" == "Darwin" ]]; then 52 | echo "Running on Mac OS." 53 | else 54 | echo "Unknown platform." 55 | exit 1 56 | fi 57 | } 58 | 59 | check_python() { 60 | if command -v python &> /dev/null; then 61 | PYTHON_CMD="python" 62 | elif command -v python3 &> /dev/null; then 63 | PYTHON_CMD="python3" 64 | else 65 | echo "Python is not installed." 66 | exit 1 67 | fi 68 | } 69 | 70 | setup() { 71 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 72 | bash "$SCRIPT_DIR"/setup.sh 73 | } 74 | 75 | run_benchmarks() { 76 | local PROMPT="$1" 77 | local REPETITIONS="$2" 78 | local MAX_TOKENS="$3" 79 | local DEVICE="$4" 80 | local MODEL_NAME="$5" 81 | 82 | # shellcheck disable=SC1091 83 | source "$SCRIPT_DIR/venv/bin/activate" 84 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 85 | --prompt "$PROMPT" \ 86 | --repetitions "$REPETITIONS" \ 87 | --max_tokens "$MAX_TOKENS" \ 88 | --model_name "$MODEL_NAME" \ 89 | --device "$DEVICE" 90 | } 91 | 92 | # Parse command-line arguments 93 | while [ "$#" -gt 0 ]; do 94 | case "$1" in 95 | -p|--prompt) 96 | PROMPT="$2" 97 | shift 2 98 | ;; 99 | -r|--repetitions) 100 | REPETITIONS="$2" 101 | shift 2 102 | ;; 103 | -m|--max_tokens) 104 | MAX_TOKENS="$2" 105 | shift 2 106 | ;; 107 | -d|--device) 108 | DEVICE="$2" 109 | case "$DEVICE" in 110 | "cuda" | "metal" | "cpu") 111 | ;; 112 | *) 113 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
114 | print_usage 115 | ;; 116 | esac 117 | if [ "$DEVICE" == "cuda" ]; then 118 | check_cuda 119 | else 120 | echo "Not supported for $DEVICE" 121 | exit 1 122 | fi 123 | shift 2 124 | ;; 125 | -n|--model_name) 126 | MODEL_NAME="$2" 127 | shift 2 128 | ;; 129 | -h|--help) 130 | print_usage 131 | ;; 132 | *) 133 | echo "Unknown option: $1" 134 | print_usage 135 | ;; 136 | esac 137 | done 138 | 139 | check_platform 140 | check_python 141 | setup 142 | 143 | # Set default values if not provided 144 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 145 | REPETITIONS="${REPETITIONS:-10}" 146 | MAX_TOKENS="${MAX_TOKENS:-512}" 147 | DEVICE="${DEVICE:-'cuda'}" 148 | MODEL_NAME="${MODEL_NAME:-"llama"}" 149 | 150 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 151 | -------------------------------------------------------------------------------- /bench_ctransformers/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements including CTransformers and GGUF weights. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Define constants and paths 12 | CURRENT_DIR="$(pwd)" 13 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 14 | VENV_DIR="$SCRIPT_DIR/venv" 15 | MODELS_DIR="$CURRENT_DIR/models" 16 | LLAMA2_GGUF_WEIGHTS_DIR="$MODELS_DIR/llama-2-7b-chat-gguf" 17 | MISTRAL_GGUF_WEIGHTS_DIR="$MODELS_DIR/mistral-7b-v0.1-instruct-gguf" 18 | 19 | # Check if Python is installed 20 | check_python() { 21 | if command -v python &> /dev/null; then 22 | PYTHON_CMD="python" 23 | elif command -v python3 &> /dev/null; then 24 | PYTHON_CMD="python3" 25 | else 26 | echo "Python is not installed." 27 | exit 1 28 | fi 29 | } 30 | 31 | install_ctransformers_cuda() { 32 | CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \(.*\),.*/\1/p') 33 | 34 | if [ -z "$CUDA_VERSION" ]; then 35 | echo "CUDA is not installed or not found." 36 | exit 1 37 | fi 38 | 39 | CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1) 40 | CUDA_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f2) 41 | 42 | if [ "$CUDA_MAJOR" -gt 12 ] || { [ "$CUDA_MAJOR" -eq 12 ] && [ "$CUDA_MINOR" -ge 2 ]; }; then 43 | echo "Detected CUDA version >= 12.2" 44 | pip install ctransformers[cuda] > /dev/null 45 | else 46 | echo "Detected CUDA version < 12.2" 47 | CMAKE_ARGS="-DCMAKE_CUDA_COMPILER=$(which nvcc)" CT_CUBLAS=1 pip install ctransformers --no-binary ctransformers > /dev/null 48 | fi 49 | } 50 | 51 | # Install CTransformers based on the specified device 52 | install_ctransformers() { 53 | local DEVICE="$1" 54 | 55 | case "$DEVICE" in 56 | cuda) 57 | echo "Installing CTransformers for CUDA." 58 | install_ctransformers_cuda 59 | ;; 60 | metal) 61 | echo "Installing CTransformers for Metal." 62 | pip uninstall ctransformers --yes 63 | CT_METAL=1 pip install ctransformers --no-binary ctransformers 64 | ;; 65 | cpu) 66 | echo "Installing CTransformers for CPU." 
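# The stock PyPI wheel already ships CPU support, so no extra build flags are needed for this branch
# (the Metal branch and the pre-12.2 CUDA branch above rebuild the package from source instead).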
67 | pip install ctransformers > /dev/null 68 | ;; 69 | *) 70 | echo "Unsupported DEVICE: $DEVICE" 71 | exit 1 72 | ;; 73 | esac 74 | } 75 | 76 | # Download GGUF weights for the specified model 77 | download_gguf_weights() { 78 | local MODEL_NAME="$1" 79 | local DOWNLOAD_DIR 80 | 81 | case "$MODEL_NAME" in 82 | llama) 83 | DOWNLOAD_DIR="$LLAMA2_GGUF_WEIGHTS_DIR" 84 | MODEL_IDENTIFIER="TheBloke/Llama-2-7B-Chat-GGUF" 85 | MODEL_FILE_4BIT="llama-2-7b-chat.Q4_K_M.gguf" 86 | MODEL_FILE_8BIT="llama-2-7b-chat.Q8_0.gguf" 87 | ;; 88 | mistral) 89 | DOWNLOAD_DIR="$MISTRAL_GGUF_WEIGHTS_DIR" 90 | MODEL_IDENTIFIER="TheBloke/Mistral-7B-Instruct-v0.1-GGUF" 91 | MODEL_FILE_4BIT="mistral-7b-instruct-v0.1.Q4_K_M.gguf" 92 | MODEL_FILE_8BIT="mistral-7b-instruct-v0.1.Q8_0.gguf" 93 | ;; 94 | *) 95 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 96 | exit 1 97 | ;; 98 | esac 99 | 100 | if [ ! -d "$DOWNLOAD_DIR" ]; then 101 | huggingface-cli download "$MODEL_IDENTIFIER" "$MODEL_FILE_4BIT" --local-dir "$DOWNLOAD_DIR" --local-dir-use-symlinks False 102 | huggingface-cli download "$MODEL_IDENTIFIER" "$MODEL_FILE_8BIT" --local-dir "$DOWNLOAD_DIR" --local-dir-use-symlinks False 103 | else 104 | echo "Weights for $MODEL_NAME already downloaded." 105 | fi 106 | } 107 | 108 | # Main script starts here 109 | 110 | if [ "$#" -ne 2 ]; then 111 | echo "Usage: $0 " 112 | exit 1 113 | fi 114 | 115 | check_python 116 | 117 | # Define command line arguments 118 | DEVICE="$1" 119 | MODEL_NAME="$2" 120 | 121 | if [ ! -d "$VENV_DIR" ]; then 122 | "$PYTHON_CMD" -m venv "$VENV_DIR" 123 | echo "Virtual environment '$VENV_DIR' created." 124 | if [ -f "$VENV_DIR/bin/activate" ]; then 125 | # shellcheck disable=SC1091 126 | source "$VENV_DIR/bin/activate" 127 | else 128 | echo "Error: Unable to find virtual environment activation script." 129 | exit 1 130 | fi 131 | 132 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 133 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 134 | install_ctransformers "$DEVICE" 135 | else 136 | if [ -f "$VENV_DIR/bin/activate" ]; then 137 | # shellcheck disable=SC1091 138 | source "$VENV_DIR/bin/activate" 139 | else 140 | echo "Error: Unable to find virtual environment activation script." 141 | exit 1 142 | fi 143 | fi 144 | 145 | 146 | download_gguf_weights "$MODEL_NAME" 147 | -------------------------------------------------------------------------------- /bench_deepspeed/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs DeepSpeed benchmark for Llama 2 Chat and Mistral v0.1 Instruct 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 
15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | 36 | check_cuda() { 37 | if command -v nvcc &> /dev/null 38 | then 39 | echo -e "\nUsing CUDA" 40 | nvcc --version 41 | else 42 | echo -e "\nCUDA is not available." 43 | exit 1 44 | fi 45 | } 46 | 47 | check_platform() { 48 | local platform 49 | platform=$(uname -s) 50 | if [[ "$platform" == "Linux" ]]; then 51 | echo "Running on Linux." 52 | elif [[ "$platform" == "Darwin" ]]; then 53 | echo "Running on Mac OS." 54 | else 55 | echo "Unknown platform." 56 | exit 1 57 | fi 58 | } 59 | 60 | check_python() { 61 | if command -v python &> /dev/null; then 62 | PYTHON_CMD="python" 63 | elif command -v python3 &> /dev/null; then 64 | PYTHON_CMD="python3" 65 | else 66 | echo "Python is not installed." 67 | exit 1 68 | fi 69 | } 70 | 71 | setup() { 72 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 73 | bash "$SCRIPT_DIR"/setup.sh 74 | } 75 | 76 | run_benchmarks() { 77 | local PROMPT="$1" 78 | local REPETITIONS="$2" 79 | local MAX_TOKENS="$3" 80 | local DEVICE="$4" 81 | local MODEL_NAME="$5" 82 | 83 | # shellcheck disable=SC1091 84 | source "$SCRIPT_DIR/venv/bin/activate" 85 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 86 | --prompt "$PROMPT" \ 87 | --repetitions "$REPETITIONS" \ 88 | --max_tokens "$MAX_TOKENS" \ 89 | --model_name "$MODEL_NAME" \ 90 | --device "$DEVICE" 91 | } 92 | 93 | while [ "$#" -gt 0 ]; do 94 | case "$1" in 95 | -p|--prompt) 96 | PROMPT="$2" 97 | shift 2 98 | ;; 99 | -r|--repetitions) 100 | REPETITIONS="$2" 101 | shift 2 102 | ;; 103 | -m|--max_tokens) 104 | MAX_TOKENS="$2" 105 | shift 2 106 | ;; 107 | -d|--device) 108 | DEVICE="$2" 109 | case "$DEVICE" in 110 | "cuda" | "metal" | "cpu") 111 | ;; 112 | *) 113 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
114 | print_usage 115 | ;; 116 | esac 117 | if [ "$DEVICE" == "cuda" ]; then 118 | check_cuda 119 | else 120 | echo "Not supported for $DEVICE" 121 | exit 1 122 | fi 123 | shift 2 124 | ;; 125 | -n|--model_name) 126 | MODEL_NAME="$2" 127 | shift 2 128 | ;; 129 | -h|--help) 130 | print_usage 131 | ;; 132 | *) 133 | echo "Unknown option: $1" 134 | print_usage 135 | ;; 136 | esac 137 | done 138 | 139 | 140 | check_platform 141 | check_python 142 | setup 143 | 144 | # Set default values if not provided 145 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 146 | REPETITIONS="${REPETITIONS:-10}" 147 | MAX_TOKENS="${MAX_TOKENS:-512}" 148 | DEVICE="${DEVICE:-'cuda'}" 149 | MODEL_NAME="${MODEL_NAME:-"llama"}" 150 | 151 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 152 | -------------------------------------------------------------------------------- /bench_exllamav2/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from exllamav2 import ExLlamaV2, ExLlamaV2Cache 6 | from exllamav2.config import ExLlamaV2Config 7 | from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler 8 | from exllamav2.tokenizer.tokenizer import ExLlamaV2Tokenizer 9 | from transformers import AutoTokenizer 10 | 11 | sys.path.append(os.getcwd()) 12 | 13 | from common.base import BaseBenchmarkClass # noqa 14 | from common.utils import launch_cli, make_report # noqa 15 | 16 | 17 | class ExLlamaV2Benchmark(BaseBenchmarkClass): 18 | def __init__( 19 | self, 20 | model_path: str, 21 | model_name: str, 22 | benchmark_name: str, 23 | precision: str, 24 | device: str, 25 | experiment_name: str, 26 | ) -> None: 27 | assert precision in ["int8", "int4"], ValueError( 28 | "Available precision: 'int8', 'int4'" 29 | ) 30 | super().__init__( 31 | model_name=model_name, 32 | model_path=model_path, 33 | benchmark_name=benchmark_name, 34 | experiment_name=experiment_name, 35 | precision=precision, 36 | device=device, 37 | ) 38 | 39 | def load_model_and_tokenizer(self): 40 | # set up model config 41 | self.config = ExLlamaV2Config() 42 | self.config.model_dir = self.model_path 43 | self.config.prepare() 44 | 45 | # set up model and cache 46 | self._model = ExLlamaV2(self.config) 47 | self.cache = ExLlamaV2Cache(self._model, lazy=True) 48 | self._model.load_autosplit(self.cache) 49 | self.tokenizer_exllama = ExLlamaV2Tokenizer(self.config) 50 | self.model = ExLlamaV2BaseGenerator( 51 | self._model, self.cache, self.tokenizer_exllama 52 | ) 53 | self.model.warmup() 54 | 55 | # set up the huggingface tokenizer 56 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) 57 | 58 | # set up exllamav2 settings 59 | self.settings = ExLlamaV2Sampler.Settings() 60 | self.settings.disallow_tokens( 61 | self.tokenizer_exllama, [self.tokenizer_exllama.eos_token_id] 62 | ) 63 | return self 64 | 65 | def preprocess( 66 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 67 | ): 68 | if chat_mode: 69 | template = self.get_chat_template_with_instruction( 70 | prompt=prompt, for_benchmarks=for_benchmarks 71 | ) 72 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 73 | tokenized_input = self.tokenizer.encode(text=prompt) 74 | return { 75 | "prompt": prompt, 76 | "input_tokens": tokenized_input, 77 | "tensor": None, 78 | "num_input_tokens": len(tokenized_input), 79 | } 80 | 81 | def run_model(self, inputs: dict, max_tokens: int, temperature: 
float) -> dict: 82 | # first set up the settings 83 | self.settings.token_repetition_penalty = 1.01 84 | self.settings.temperature = temperature 85 | self.settings.top_k = 50 86 | self.settings.top_p = 0.1 87 | 88 | # now run the model 89 | prompt = inputs["prompt"] 90 | output_text = self.model.generate_simple( 91 | prompt, 92 | self.settings, 93 | max_tokens, 94 | seed=1234, 95 | completion_only=True, 96 | decode_special_tokens=True, 97 | ) 98 | 99 | tokenized_output = self.tokenizer.encode(output_text) 100 | return { 101 | "output_text": output_text, 102 | "output_tokens": tokenized_output, 103 | "num_output_tokens": len(tokenized_output), 104 | } 105 | 106 | def postprocess(self, output: dict) -> str: 107 | return output["output_text"] 108 | 109 | def on_exit(self): 110 | if self.device == "cuda": 111 | del self.model 112 | torch.cuda.synchronize() 113 | else: 114 | del self.model 115 | 116 | 117 | if __name__ == "__main__": 118 | parser = launch_cli(description="ExLlamaV2 Benchmark.") 119 | args = parser.parse_args() 120 | 121 | model_folder = os.path.join(os.getcwd(), "models") 122 | model_name = ( 123 | f"{args.model_name}-2-7b-chat-exllamav2-" 124 | if args.model_name == "llama" 125 | else f"{args.model_name}-7b-v0.1-instruct-exllamav2-" 126 | ) 127 | 128 | runner_dict = { 129 | "cuda": [ 130 | { 131 | "precision": "int4", 132 | "model_path": os.path.join(model_folder, model_name + "4.0-bit"), 133 | }, 134 | { 135 | "precision": "int8", 136 | "model_path": os.path.join(model_folder, model_name + "8.0-bit"), 137 | }, 138 | ] 139 | } 140 | 141 | make_report( 142 | args=args, 143 | benchmark_class=ExLlamaV2Benchmark, 144 | runner_dict=runner_dict, 145 | benchmark_name="ExLlamaV2", 146 | is_bench_pytorch=False, 147 | ) 148 | -------------------------------------------------------------------------------- /docs/ml_engines.md: -------------------------------------------------------------------------------- 1 | # 🔧 ML Engines 2 | 3 | ### Model Framework Support Matrix 4 | 5 | | Engine | Float32 | Float16 | Int8 | Int4 | CUDA | ROCM | Mac M1/M2 | Training | 6 | | ------------------------------------------ | :-----: | :-----: | :---: | :---: | :---: | :---: | :-------: | :------: | 7 | | [candle](/bench_candle/) | ⚠️ | ✅ | ⚠️ | ⚠️ | ✅ | ❌ | 🚧 | ❌ | 8 | | [llama.cpp](/bench_llamacpp/) | ❌ | ❌ | ✅ | ✅ | ✅ | 🚧 | 🚧 | ❌ | 9 | | [ctranslate](/bench_ctranslate/) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | 🚧 | ❌ | 10 | | [onnx](/bench_onnxruntime/) | ✅ | ✅ | ❌ | ❌ | ✅ | ⚠️ | ❌ | ❌ | 11 | | [transformers (pytorch)](/bench_pytorch/) | ✅ | ✅ | ✅ | ✅ | ✅ | 🚧 | ✅ | ✅ | 12 | | [vllm](/bench_vllm/) | ✅ | ✅ | ❌ | ✅ | ✅ | 🚧 | ❌ | ❌ | 13 | | [exllamav2](/bench_exllamav2/) | ❌ | ❌ | ✅ | ✅ | ✅ | 🚧 | ❌ | ❌ | 14 | | [ctransformers](/bench_ctransformers/) | ❌ | ❌ | ✅ | ✅ | ✅ | 🚧 | 🚧 | ❌ | 15 | | [AutoGPTQ](/bench_autogptq/) | ✅ | ✅ | ⚠️ | ⚠️ | ✅ | ❌ | ❌ | ❌ | 16 | | [AutoAWQ](/bench_autoawq/) | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | 17 | | [DeepSpeed-MII](/bench_deepspeed/) | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ⚠️ | 18 | | [PyTorch Lightning](/bench_lightning/) | ✅ | ✅ | ✅ | ✅ | ✅ | ⚠️ | ⚠️ | ✅ | 19 | | [Optimum Nvidia](/bench_optimum_nvidia/) | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | 20 | | [Nvidia TensorRT-LLM](/bench_tensorrtllm/) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | 21 | 22 | 23 | ### Legend: 24 | - ✅ Supported 25 | - ❌ Not Supported 26 | - ⚠️ There is a catch related to this 27 | - 🚧 It is supported but not implemented in this current version 28 | 29 | 30 | ### Some pointers to note: 31 | The names are by the name of engines. 
Except when the name is `Generic` then it means that the nuance applies to all the engines. 32 | 33 | 34 | | Name | Type | Description | 35 | | ----------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 36 | | candle | ⚠️ | Metal backend is supported but it gives terrible performance even in small models like Phi2. For AMD ROCM there is no support as per this [issue](https://github.com/huggingface/candle/issues/346). | 37 | | candle | 🚧 | Latest performance for Candle is not implemented. If you want to see the numbers, please check out [archive.md](/docs/archive.md) which contains the benchmark numbers for [Llama 2 7B](https://huggingface.co/meta-llama/Llama-2-7b). | 38 | | ctranslate2 | ⚠️ | ROCM is not supported; however, works are in progress to have this feature on CTranslate2. No support for Mac M1/M2. | 39 | | onnxruntime | ⚠️ | ONNXRuntime in general supports ROCM, but specific to LLMs and ONNXRuntime with HuggingFace Optimum only supports CUDAExecution provider right now. For CPU, it is available but super slow. | 40 | | pytorch lightning | ⚠️ | ROCM is supported but not tested for PyTorch Lightning. See this [issue](https://github.com/Lightning-AI/litgpt/issues/1220). | 41 | | pytorch lightning | ⚠️ | Metal is supported in PyTorch Lightning, but for Llama 2 7B Chat or Mistral 7B, it is super slow. | 42 | | AutoGPTQ | ⚠️ | AutoGPTQ is a weight-only quantization algorithm. Activation still remains in either float32 or float16. We used a 4-bit weight quantized model for our benchmarks experiment. | 43 | | Generic | 🚧 | For all the engines which support metal, please check out [archive.md](/docs/archive.md) which contains the benchmark numbers for [Llama 2 7B](https://huggingface.co/meta-llama/Llama-2-7b). | 44 | | Deepspeed | ⚠️ | [DeepSpeed](https://github.com/microsoft/DeepSpeed) supports training; however, for inference, we have used [DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII). | 45 | -------------------------------------------------------------------------------- /bench_exllamav2/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks ExLlamaV2 benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 
15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_platform() { 47 | local platform 48 | platform=$(uname -s) 49 | if [[ "$platform" == "Linux" ]]; then 50 | echo "Running on Linux." 51 | elif [[ "$platform" == "Darwin" ]]; then 52 | echo "Running on Mac OS." 53 | else 54 | echo "Unknown platform." 55 | exit 1 56 | fi 57 | } 58 | 59 | check_python() { 60 | if command -v python &> /dev/null; then 61 | PYTHON_CMD="python" 62 | elif command -v python3 &> /dev/null; then 63 | PYTHON_CMD="python3" 64 | else 65 | echo "Python is not installed." 66 | exit 1 67 | fi 68 | } 69 | 70 | setup() { 71 | local MODEL_NAME="${1:-llama}" 72 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 73 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 74 | } 75 | 76 | run_benchmarks() { 77 | local PROMPT="$1" 78 | local REPETITIONS="$2" 79 | local MAX_TOKENS="$3" 80 | local DEVICE="$4" 81 | local MODEL_NAME="$5" 82 | 83 | # shellcheck disable=SC1091 84 | source "$SCRIPT_DIR/venv/bin/activate" 85 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 86 | --prompt "$PROMPT" \ 87 | --repetitions "$REPETITIONS" \ 88 | --max_tokens "$MAX_TOKENS" \ 89 | --model_name "$MODEL_NAME" \ 90 | --device "$DEVICE" 91 | } 92 | 93 | 94 | while [ "$#" -gt 0 ]; do 95 | case "$1" in 96 | -p|--prompt) 97 | PROMPT="$2" 98 | shift 2 99 | ;; 100 | -r|--repetitions) 101 | REPETITIONS="$2" 102 | shift 2 103 | ;; 104 | -m|--max_tokens) 105 | MAX_TOKENS="$2" 106 | shift 2 107 | ;; 108 | -d|--device) 109 | DEVICE="$2" 110 | case "$DEVICE" in 111 | "cuda" | "metal" | "cpu") 112 | ;; 113 | *) 114 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
115 | print_usage 116 | ;; 117 | esac 118 | if [ "$DEVICE" == "cuda" ]; then 119 | check_cuda 120 | else 121 | echo "Not supported for $DEVICE" 122 | exit 1 123 | fi 124 | shift 2 125 | ;; 126 | -n|--model_name) 127 | MODEL_NAME="$2" 128 | shift 2 129 | ;; 130 | -h|--help) 131 | print_usage 132 | ;; 133 | *) 134 | echo "Unknown option: $1" 135 | print_usage 136 | ;; 137 | esac 138 | done 139 | 140 | check_platform 141 | check_python 142 | setup "$MODEL_NAME" 143 | 144 | # Set default values if not provided 145 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 146 | REPETITIONS="${REPETITIONS:-10}" 147 | MAX_TOKENS="${MAX_TOKENS:-512}" 148 | DEVICE="${DEVICE:-'cuda'}" 149 | MODEL_NAME="${MODEL_NAME:-"llama"}" 150 | 151 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 152 | -------------------------------------------------------------------------------- /bench_autogptq/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks AutoGPTQ benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | 36 | check_cuda() { 37 | if command -v nvcc &> /dev/null 38 | then 39 | echo -e "\nUsing CUDA" 40 | nvcc --version 41 | else 42 | echo -e "\nCUDA is not available." 43 | exit 1 44 | fi 45 | } 46 | 47 | check_platform() { 48 | local platform 49 | platform=$(uname -s) 50 | if [[ "$platform" == "Linux" ]]; then 51 | echo "Running on Linux." 52 | elif [[ "$platform" == "Darwin" ]]; then 53 | echo "Running on Mac OS." 54 | else 55 | echo "Unknown platform." 
56 | exit 1 57 | fi 58 | } 59 | 60 | check_python() { 61 | if command -v python &> /dev/null; then 62 | PYTHON_CMD="python" 63 | elif command -v python3 &> /dev/null; then 64 | PYTHON_CMD="python3" 65 | else 66 | echo "Python is not installed." 67 | exit 1 68 | fi 69 | } 70 | 71 | 72 | setup() { 73 | local MODEL_NAME="${1:-llama}" 74 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 75 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 76 | } 77 | 78 | run_benchmarks() { 79 | local PROMPT="$1" 80 | local REPETITIONS="$2" 81 | local MAX_TOKENS="$3" 82 | local DEVICE="$4" 83 | local MODEL_NAME="$5" 84 | 85 | # shellcheck disable=SC1091 86 | source "$SCRIPT_DIR/venv/bin/activate" 87 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 88 | --prompt "$PROMPT" \ 89 | --repetitions "$REPETITIONS" \ 90 | --max_tokens "$MAX_TOKENS" \ 91 | --model_name "$MODEL_NAME" \ 92 | --device "$DEVICE" 93 | } 94 | 95 | while [ "$#" -gt 0 ]; do 96 | case "$1" in 97 | -p|--prompt) 98 | PROMPT="$2" 99 | shift 2 100 | ;; 101 | -r|--repetitions) 102 | REPETITIONS="$2" 103 | shift 2 104 | ;; 105 | -m|--max_tokens) 106 | MAX_TOKENS="$2" 107 | shift 2 108 | ;; 109 | -d|--device) 110 | DEVICE="$2" 111 | case "$DEVICE" in 112 | "cuda" | "metal" | "cpu") 113 | ;; 114 | *) 115 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 116 | print_usage 117 | ;; 118 | esac 119 | if [ "$DEVICE" == "cuda" ]; then 120 | check_cuda 121 | else 122 | echo "Not supported for $DEVICE" 123 | exit 1 124 | fi 125 | shift 2 126 | ;; 127 | -n|--model_name) 128 | MODEL_NAME="$2" 129 | shift 2 130 | ;; 131 | -h|--help) 132 | print_usage 133 | ;; 134 | *) 135 | echo "Unknown option: $1" 136 | print_usage 137 | ;; 138 | esac 139 | done 140 | 141 | check_platform 142 | check_python 143 | setup "$MODEL_NAME" 144 | 145 | # Set default values if not provided 146 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 147 | REPETITIONS="${REPETITIONS:-10}" 148 | MAX_TOKENS="${MAX_TOKENS:-512}" 149 | DEVICE="${DEVICE:-'cuda'}" 150 | MODEL_NAME="${MODEL_NAME:-"llama"}" 151 | 152 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 153 | -------------------------------------------------------------------------------- /bench_autoawq/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks AutoAWQ llama benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 
15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | 36 | check_cuda() { 37 | if command -v nvcc &> /dev/null 38 | then 39 | echo -e "\nUsing CUDA" 40 | nvcc --version 41 | else 42 | echo -e "\nCUDA is not available." 43 | exit 1 44 | fi 45 | } 46 | 47 | check_platform() { 48 | local platform 49 | platform=$(uname -s) 50 | if [[ "$platform" == "Linux" ]]; then 51 | echo "Running on Linux." 52 | elif [[ "$platform" == "Darwin" ]]; then 53 | echo "Running on Mac OS." 54 | else 55 | echo "Unknown platform." 56 | exit 1 57 | fi 58 | } 59 | 60 | check_python() { 61 | if command -v python &> /dev/null; then 62 | PYTHON_CMD="python" 63 | elif command -v python3 &> /dev/null; then 64 | PYTHON_CMD="python3" 65 | else 66 | echo "Python is not installed." 67 | exit 1 68 | fi 69 | } 70 | 71 | 72 | setup() { 73 | local MODEL_NAME="${1:-llama}" 74 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 75 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 76 | } 77 | 78 | run_benchmarks() { 79 | local PROMPT="$1" 80 | local REPETITIONS="$2" 81 | local MAX_TOKENS="$3" 82 | local DEVICE="$4" 83 | local MODEL_NAME="$5" 84 | 85 | # shellcheck disable=SC1091 86 | source "$SCRIPT_DIR/venv/bin/activate" 87 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 88 | --prompt "$PROMPT" \ 89 | --repetitions "$REPETITIONS" \ 90 | --max_tokens "$MAX_TOKENS" \ 91 | --model_name "$MODEL_NAME" \ 92 | --device "$DEVICE" 93 | } 94 | 95 | while [ "$#" -gt 0 ]; do 96 | case "$1" in 97 | -p|--prompt) 98 | PROMPT="$2" 99 | shift 2 100 | ;; 101 | -r|--repetitions) 102 | REPETITIONS="$2" 103 | shift 2 104 | ;; 105 | -m|--max_tokens) 106 | MAX_TOKENS="$2" 107 | shift 2 108 | ;; 109 | -d|--device) 110 | DEVICE="$2" 111 | case "$DEVICE" in 112 | "cuda" | "metal" | "cpu") 113 | ;; 114 | *) 115 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
116 | print_usage 117 | ;; 118 | esac 119 | if [ "$DEVICE" == "cuda" ]; then 120 | check_cuda 121 | else 122 | echo "Not supported for $DEVICE" 123 | exit 1 124 | fi 125 | shift 2 126 | ;; 127 | -n|--model_name) 128 | MODEL_NAME="$2" 129 | shift 2 130 | ;; 131 | -h|--help) 132 | print_usage 133 | ;; 134 | *) 135 | echo "Unknown option: $1" 136 | print_usage 137 | ;; 138 | esac 139 | done 140 | 141 | check_platform 142 | check_python 143 | setup "$MODEL_NAME" 144 | 145 | # Set default values if not provided 146 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 147 | REPETITIONS="${REPETITIONS:-10}" 148 | MAX_TOKENS="${MAX_TOKENS:-512}" 149 | DEVICE="${DEVICE:-'cuda'}" 150 | MODEL_NAME="${MODEL_NAME:-"llama"}" 151 | 152 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 153 | -------------------------------------------------------------------------------- /bench_lightning/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks PyTorch Lightning benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_platform() { 47 | local platform 48 | platform=$(uname -s) 49 | if [[ "$platform" == "Linux" ]]; then 50 | echo "Running on Linux." 51 | elif [[ "$platform" == "Darwin" ]]; then 52 | echo "Running on Mac OS." 53 | else 54 | echo "Unknown platform." 
55 | exit 1 56 | fi 57 | } 58 | 59 | check_python() { 60 | if command -v python &> /dev/null; then 61 | PYTHON_CMD="python" 62 | elif command -v python3 &> /dev/null; then 63 | PYTHON_CMD="python3" 64 | else 65 | echo "Python is not installed." 66 | exit 1 67 | fi 68 | } 69 | 70 | 71 | setup() { 72 | local MODEL_NAME="${1:-llama}" 73 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 74 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 75 | } 76 | 77 | run_benchmarks() { 78 | local PROMPT="$1" 79 | local REPETITIONS="$2" 80 | local MAX_TOKENS="$3" 81 | local DEVICE="$4" 82 | local MODEL_NAME="$5" 83 | 84 | # shellcheck disable=SC1091 85 | source "$SCRIPT_DIR/venv/bin/activate" 86 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 87 | --prompt "$PROMPT" \ 88 | --repetitions "$REPETITIONS" \ 89 | --max_tokens "$MAX_TOKENS" \ 90 | --model_name "$MODEL_NAME" \ 91 | --device "$DEVICE" 92 | } 93 | 94 | while [ "$#" -gt 0 ]; do 95 | case "$1" in 96 | -p|--prompt) 97 | PROMPT="$2" 98 | shift 2 99 | ;; 100 | -r|--repetitions) 101 | REPETITIONS="$2" 102 | shift 2 103 | ;; 104 | -m|--max_tokens) 105 | MAX_TOKENS="$2" 106 | shift 2 107 | ;; 108 | -d|--device) 109 | DEVICE="$2" 110 | case "$DEVICE" in 111 | "cuda" | "metal" | "cpu") 112 | ;; 113 | *) 114 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 115 | print_usage 116 | ;; 117 | esac 118 | if [ "$DEVICE" == "cuda" ]; then 119 | check_cuda 120 | else 121 | echo "Not supported for $DEVICE" 122 | exit 1 123 | fi 124 | shift 2 125 | ;; 126 | -n|--model_name) 127 | MODEL_NAME="$2" 128 | shift 2 129 | ;; 130 | -h|--help) 131 | print_usage 132 | ;; 133 | *) 134 | echo "Unknown option: $1" 135 | print_usage 136 | ;; 137 | esac 138 | done 139 | 140 | check_platform 141 | check_python 142 | setup "$MODEL_NAME" 143 | 144 | # Set default values if not provided 145 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 146 | REPETITIONS="${REPETITIONS:-10}" 147 | MAX_TOKENS="${MAX_TOKENS:-512}" 148 | DEVICE="${DEVICE:-'cuda'}" 149 | MODEL_NAME="${MODEL_NAME:-"llama"}" 150 | 151 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 152 | -------------------------------------------------------------------------------- /bench_vllm/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks vLLM benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 
15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_platform() { 47 | local platform 48 | platform=$(uname -s) 49 | if [[ "$platform" == "Linux" ]]; then 50 | echo "Running on Linux." 51 | elif [[ "$platform" == "Darwin" ]]; then 52 | echo "Running on Mac OS." 53 | else 54 | echo "Unknown platform." 55 | exit 1 56 | fi 57 | } 58 | 59 | check_python() { 60 | if command -v python &> /dev/null; then 61 | PYTHON_CMD="python" 62 | elif command -v python3 &> /dev/null; then 63 | PYTHON_CMD="python3" 64 | else 65 | echo "Python is not installed." 66 | exit 1 67 | fi 68 | } 69 | 70 | 71 | setup() { 72 | local DEVICE="$1" 73 | local MODEL_NAME="${2:-llama}" 74 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 75 | bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME" 76 | } 77 | 78 | run_benchmarks() { 79 | local PROMPT="$1" 80 | local REPETITIONS="$2" 81 | local MAX_TOKENS="$3" 82 | local DEVICE="$4" 83 | local MODEL_NAME="$5" 84 | 85 | # shellcheck disable=SC1091 86 | source "$SCRIPT_DIR/venv/bin/activate" 87 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 88 | --prompt "$PROMPT" \ 89 | --repetitions "$REPETITIONS" \ 90 | --max_tokens "$MAX_TOKENS" \ 91 | --model_name "$MODEL_NAME" \ 92 | --device "$DEVICE" 93 | } 94 | 95 | # Parse command-line arguments 96 | while [ "$#" -gt 0 ]; do 97 | case "$1" in 98 | -p|--prompt) 99 | PROMPT="$2" 100 | shift 2 101 | ;; 102 | -r|--repetitions) 103 | REPETITIONS="$2" 104 | shift 2 105 | ;; 106 | -m|--max_tokens) 107 | MAX_TOKENS="$2" 108 | shift 2 109 | ;; 110 | -d|--device) 111 | DEVICE="$2" 112 | case "$DEVICE" in 113 | "cuda" | "metal" | "cpu") 114 | ;; 115 | *) 116 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
117 | print_usage 118 | ;; 119 | esac 120 | if [ "$DEVICE" == "cuda" ]; then 121 | check_cuda 122 | else 123 | echo "Not supported for $DEVICE" 124 | exit 1 125 | fi 126 | shift 2 127 | ;; 128 | -n|--model_name) 129 | MODEL_NAME="$2" 130 | shift 2 131 | ;; 132 | -h|--help) 133 | print_usage 134 | ;; 135 | *) 136 | echo "Unknown option: $1" 137 | print_usage 138 | ;; 139 | esac 140 | done 141 | 142 | check_platform 143 | check_python 144 | 145 | # Set default values if not provided 146 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 147 | REPETITIONS="${REPETITIONS:-10}" 148 | MAX_TOKENS="${MAX_TOKENS:-512}" 149 | DEVICE="${DEVICE:-'cuda'}" 150 | MODEL_NAME="${MODEL_NAME:-"llama"}" 151 | 152 | setup "$DEVICE" "$MODEL_NAME" 153 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 154 | -------------------------------------------------------------------------------- /bench_vllm/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | CURRENT_DIR="$(pwd)" 12 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 13 | 14 | # Set default folder paths for AWQ weights 15 | LLAMA2_AWQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/llama-2-7b-chat-autoawq" 16 | MISTRAL_AWQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-autoawq" 17 | 18 | 19 | check_python() { 20 | if command -v python &> /dev/null; then 21 | PYTHON_CMD="python" 22 | elif command -v python3 &> /dev/null; then 23 | PYTHON_CMD="python3" 24 | else 25 | echo "Python is not installed." 26 | exit 1 27 | fi 28 | } 29 | 30 | check_python 31 | 32 | install_vllm_cuda() { 33 | CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \(.*\),.*/\1/p') 34 | 35 | if [ -z "$CUDA_VERSION" ]; then 36 | echo "CUDA is not installed or not found." 37 | exit 1 38 | fi 39 | 40 | CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1) 41 | CUDA_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f2) 42 | 43 | if [ "$CUDA_MAJOR" -ge 12 ] || { [ "$CUDA_MAJOR" -eq 12 ] && [ "$CUDA_MINOR" -ge 0 ]; }; then 44 | echo "Detected CUDA version >= 12.2" 45 | "$PYTHON_CMD" -m pip install vllm==0.4.0 transformers==4.39.2 46 | else 47 | echo "Detected CUDA version < 12.2" 48 | PY_VERSION=$(get_python_version) 49 | if [ -z "$PY_VERSION" ]; then 50 | echo "Python version not found." 
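# The prebuilt CUDA 11.8 wheel below is published per CPython ABI tag (cp38-cp311), so the interpreter
# version has to be resolved before a wheel filename can be chosen.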
51 | exit 1 52 | fi 53 | echo "Installing vllm for CUDA 11.8 with Python version: $PY_VERSION" 54 | # Download vllm for CUDA 11.8 and specified Python version 55 | "$PYTHON_CMD" -m pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-"$PY_VERSION"-"$PY_VERSION"-manylinux1_x86_64.whl 56 | "$PYTHON_CMD" -m pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118 57 | "$PYTHON_CMD" -m pip install huggingface-cli==0.1 transformers==4.39.2 58 | fi 59 | } 60 | 61 | get_python_version() { 62 | # Fetch Python version 63 | PY_VER=$("$PYTHON_CMD" -c 'import sys; print(".".join(map(str, sys.version_info[:2])))') 64 | 65 | case $PY_VER in 66 | 3.10) echo "cp310";; 67 | 3.8) echo "cp38";; 68 | 3.9) echo "cp39";; 69 | 3.11) echo "cp311";; 70 | *) echo "Unknown Python version"; exit 1;; 71 | esac 72 | } 73 | 74 | 75 | install_device_specific_vllm() { 76 | local DEVICE="$1" 77 | 78 | if [ "$#" -ne 1 ]; then 79 | echo "Usage: $0 " 80 | exit 1 81 | fi 82 | 83 | case "$DEVICE" in 84 | cuda) 85 | echo "Installing VLLM for CUDA." 86 | install_vllm_cuda 87 | ;; 88 | metal) 89 | echo "VLLM for metal is not supported yet." 90 | echo "For more information, checkout this issue: https://github.com/vllm-project/vllm/issues/1441" 91 | return 1 92 | ;; 93 | cpu) 94 | echo "VLLM for CPU is not supported yet." 95 | echo "For more information, checkout this issue: https://github.com/vllm-project/vllm/issues/176" 96 | ;; 97 | *) 98 | echo "Unsupported DEVICE: $DEVICE" 99 | return 1 100 | ;; 101 | esac 102 | } 103 | 104 | download_awq_weights() { 105 | local MODEL_NAME="$1" 106 | 107 | # Set download directory based on MODEL_NAME 108 | if [ "$MODEL_NAME" = "llama" ]; then 109 | DOWNLOAD_DIR="$LLAMA2_AWQ_WEIGHTS_FOLDER" 110 | MODEL_IDENTIFIER="TheBloke/Llama-2-7B-Chat-AWQ" 111 | elif [ "$MODEL_NAME" = "mistral" ]; then 112 | DOWNLOAD_DIR="$MISTRAL_AWQ_WEIGHTS_FOLDER" 113 | MODEL_IDENTIFIER="TheBloke/Mistral-7B-Instruct-v0.1-AWQ" 114 | else 115 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 116 | exit 1 117 | fi 118 | 119 | # Check if weights folder exists 120 | echo "$DOWNLOAD_DIR" 121 | 122 | if [ ! -d "$DOWNLOAD_DIR" ]; then 123 | # Download weights using huggingface-cli 124 | echo "Downloading weights to $DOWNLOAD_DIR..." 125 | huggingface-cli download "$MODEL_IDENTIFIER" --local-dir "$DOWNLOAD_DIR" --exclude "*.git*" "*.md" "Notice" "LICENSE" 126 | else 127 | echo "Weights already downloaded" 128 | fi 129 | } 130 | 131 | 132 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 133 | VENV_DIR="$SCRIPT_DIR/venv" 134 | 135 | DEVICE="$1" 136 | MODEL_NAME="$2" 137 | 138 | 139 | # Build and activate the virtual environment. 140 | 141 | if [ ! -d "$VENV_DIR" ]; then 142 | "$PYTHON_CMD" -m venv "$VENV_DIR" 143 | echo "Virtual environment '$VENV_DIR' created." 
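# First-time setup: activate the fresh environment, install a vLLM build matched to the local CUDA
# toolkit, and only then fall through to download the AWQ weights used by the int4 runs.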
144 | # shellcheck disable=SC1091 145 | source "$VENV_DIR/bin/activate" 146 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 147 | install_device_specific_vllm "$DEVICE" 148 | else 149 | # shellcheck disable=SC1091 150 | source "$VENV_DIR/bin/activate" 151 | fi 152 | 153 | download_awq_weights "$MODEL_NAME" 154 | -------------------------------------------------------------------------------- /bench_llamacpp/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks LlamaCPP llama benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_platform() { 47 | local platform 48 | platform=$(uname -s) 49 | if [[ "$platform" == "Linux" ]]; then 50 | echo "Running on Linux." 51 | elif [[ "$platform" == "Darwin" ]]; then 52 | echo "Running on Mac OS." 53 | else 54 | echo "Unknown platform." 55 | exit 1 56 | fi 57 | } 58 | 59 | check_python() { 60 | if command -v python &> /dev/null; then 61 | PYTHON_CMD="python" 62 | elif command -v python3 &> /dev/null; then 63 | PYTHON_CMD="python3" 64 | else 65 | echo "Python is not installed." 66 | exit 1 67 | fi 68 | } 69 | 70 | setup() { 71 | local DEVICE="$1" 72 | local MODEL_NAME="$2" 73 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 
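# setup.sh is expected to produce a device-specific shared library under venv/ (libllama_<device>.so);
# run_benchmarks below points LLAMA_CPP_LIB at it for CUDA and Metal runs.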
74 | bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME" 75 | } 76 | 77 | run_benchmarks() { 78 | local PROMPT="$1" 79 | local REPETITIONS="$2" 80 | local MAX_TOKENS="$3" 81 | local DEVICE="$4" 82 | local MODEL_NAME="$5" 83 | 84 | if [ "$DEVICE" == "cuda" ] || [ "$DEVICE" == "metal" ]; then 85 | export LLAMA_CPP_LIB=$SCRIPT_DIR/venv/libllama_$DEVICE.so 86 | echo "LLAMA_CPP_LIB=$LLAMA_CPP_LIB" 87 | fi 88 | 89 | # shellcheck disable=SC1091 90 | source "$SCRIPT_DIR/venv/bin/activate" 91 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 92 | --prompt "$PROMPT" \ 93 | --repetitions "$REPETITIONS" \ 94 | --max_tokens "$MAX_TOKENS" \ 95 | --model_name "$MODEL_NAME" \ 96 | --device "$DEVICE" 97 | } 98 | 99 | while [ "$#" -gt 0 ]; do 100 | case "$1" in 101 | -p|--prompt) 102 | PROMPT="$2" 103 | shift 2 104 | ;; 105 | -r|--repetitions) 106 | REPETITIONS="$2" 107 | shift 2 108 | ;; 109 | -m|--max_tokens) 110 | MAX_TOKENS="$2" 111 | shift 2 112 | ;; 113 | -d|--device) 114 | DEVICE="$2" 115 | case "$DEVICE" in 116 | "cuda" | "metal" | "cpu") 117 | ;; 118 | *) 119 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 120 | print_usage 121 | ;; 122 | esac 123 | if [ "$DEVICE" == "cuda" ]; then 124 | check_cuda 125 | else 126 | echo "Not supported for $DEVICE" 127 | exit 1 128 | fi 129 | shift 2 130 | ;; 131 | -n|--model_name) 132 | MODEL_NAME="$2" 133 | shift 2 134 | ;; 135 | -h|--help) 136 | print_usage 137 | ;; 138 | *) 139 | echo "Unknown option: $1" 140 | print_usage 141 | ;; 142 | esac 143 | done 144 | 145 | check_platform 146 | check_python 147 | 148 | # Set default values if not provided 149 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 150 | REPETITIONS="${REPETITIONS:-10}" 151 | MAX_TOKENS="${MAX_TOKENS:-512}" 152 | DEVICE="${DEVICE:-cuda}" 153 | MODEL_NAME="${MODEL_NAME:-"llama"}" 154 | 155 | setup "$DEVICE" "$MODEL_NAME" 156 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 157 | -------------------------------------------------------------------------------- /bench_autogptq/bench.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | import torch 6 | from auto_gptq import AutoGPTQForCausalLM 7 | from transformers import AutoTokenizer 8 | 9 | sys.path.append(os.getcwd()) 10 | 11 | from common.base import BaseBenchmarkClass # noqa 12 | from common.utils import launch_cli, make_report # noqa 13 | 14 | _MESSAGE = """ 15 | GPTQ adopts a mixed int4/fp16 quantization scheme where weights are quantized as int4 while activations remain 16 | in float16. During inference, weights are dequantized on the fly and the actual compute is performed in float16.
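Roughly speaking, each group of weights is stored as 4-bit integers q plus a float16 scale s and zero-point z,
and the kernel reconstructs w ~= s * (q - z) on the fly just before the matmul, so memory footprint and bandwidth
drop while the arithmetic itself stays in float16.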
17 | """ 18 | 19 | 20 | class AutoGPTQBenchmark(BaseBenchmarkClass): 21 | def __init__( 22 | self, 23 | model_path: str, 24 | model_name: str, 25 | benchmark_name: str, 26 | precision: str, 27 | device: str, 28 | experiment_name: str, 29 | ) -> None: 30 | super().__init__( 31 | model_name=model_name, 32 | model_path=model_path, 33 | benchmark_name=benchmark_name, 34 | experiment_name=experiment_name, 35 | precision=precision, 36 | device=device, 37 | ) 38 | 39 | if model_name == "llama": 40 | self.tokenizer_folder = os.path.join( 41 | os.getcwd(), "models", "llama-2-7b-chat-hf" 42 | ) 43 | else: 44 | self.tokenizer_folder = os.path.join( 45 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 46 | ) 47 | 48 | self.precision_map = {"float16": torch.float16, "float32": torch.float32} 49 | 50 | def load_model_and_tokenizer(self): 51 | device = "cuda:0" if self.device == "cuda" else self.device 52 | 53 | if self.model_name == "llama": 54 | if self.precision == "float16": 55 | use_marlin = True 56 | else: 57 | use_marlin = False 58 | else: 59 | use_marlin = False 60 | 61 | self.model = AutoGPTQForCausalLM.from_quantized( 62 | self.model_path, 63 | device=device, 64 | use_marlin=use_marlin, 65 | torch_dtype=self.precision_map[self.precision], 66 | ) 67 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 68 | return self 69 | 70 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 71 | if chat_mode: 72 | template = self.get_chat_template_with_instruction( 73 | prompt=prompt, for_benchmarks=for_benchmarks 74 | ) 75 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 76 | 77 | tokenized_input = self.tokenizer.encode(text=prompt) 78 | tensor = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device) 79 | 80 | return { 81 | "prompt": prompt, 82 | "input_tokens": tokenized_input, 83 | "tensor": tensor, 84 | "num_input_tokens": len(tokenized_input), 85 | } 86 | 87 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 88 | tensor = inputs["tensor"] 89 | num_input_tokens = inputs["num_input_tokens"] 90 | 91 | output = ( 92 | self.model.generate( 93 | input_ids=tensor, 94 | max_new_tokens=max_tokens, 95 | temperature=temperature, 96 | do_sample=True, 97 | ) 98 | .detach() 99 | .tolist()[0] 100 | ) 101 | 102 | output_tokens = ( 103 | output[num_input_tokens:] if len(output) > num_input_tokens else output 104 | ) 105 | return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)} 106 | 107 | def postprocess(self, output: dict) -> str: 108 | output_tokens = output["output_tokens"] 109 | return self.tokenizer.decode(output_tokens, skip_special_tokens=True) 110 | 111 | def on_exit(self): 112 | if self.device == "cuda:0": 113 | del self.model 114 | torch.cuda.synchronize() 115 | else: 116 | del self.model 117 | 118 | 119 | if __name__ == "__main__": 120 | parser = launch_cli(description="AutoGPTQ Benchmark.") 121 | args = parser.parse_args() 122 | 123 | model_folder = os.path.join(os.getcwd(), "models") 124 | model_name = ( 125 | f"{args.model_name}-2-7b-chat-autogptq" 126 | if args.model_name == "llama" 127 | else f"{args.model_name}-7b-v0.1-instruct-autogptq" 128 | ) 129 | logging.info(_MESSAGE) 130 | 131 | runner_dict = { 132 | "cuda": [ 133 | { 134 | "precision": "float16", 135 | "model_path": os.path.join(model_folder, model_name), 136 | }, 137 | { 138 | "precision": "float32", 139 | "model_path": os.path.join(model_folder, model_name), 140 | }, 141 | ] 142 | } 143 | 144 | if 
args.device == "cpu": 145 | logging.info("Skipping running model on int4 on CPU, not implemented for Half") 146 | pass 147 | else: 148 | make_report( 149 | args=args, 150 | benchmark_class=AutoGPTQBenchmark, 151 | runner_dict=runner_dict, 152 | benchmark_name="AutoGPTQ", 153 | is_bench_pytorch=False, 154 | ) 155 | -------------------------------------------------------------------------------- /bench_onnxruntime/bench.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import os 3 | import sys 4 | import time 5 | 6 | import torch 7 | from onnxruntime import InferenceSession 8 | from optimum.onnxruntime import ORTModelForCausalLM 9 | from transformers import AutoConfig, AutoTokenizer 10 | 11 | sys.path.append("/mnt") 12 | sys.path.append("/mnt/benchmarks/") 13 | 14 | from common.base import BaseBenchmarkClass # noqa 15 | from common.utils import launch_cli, make_report # noqa 16 | 17 | 18 | class ONNXOptimumBenchmark(BaseBenchmarkClass): 19 | def __init__( 20 | self, 21 | model_path: str, 22 | model_name: str, 23 | benchmark_name: str, 24 | precision: str, 25 | device: str, 26 | experiment_name: str, 27 | ) -> None: 28 | assert precision in ["float32", "float16"], ValueError( 29 | "Supported precision: 'float32' and 'float16'" 30 | ) 31 | assert device in ["cuda"], ValueError( 32 | "Current implement is only supported for device = 'cuda'" 33 | ) 34 | super().__init__( 35 | model_name=model_name, 36 | model_path=model_path, 37 | benchmark_name=benchmark_name, 38 | experiment_name=experiment_name, 39 | precision=precision, 40 | device=device, 41 | root_folder="/mnt/benchmarks", 42 | ) 43 | 44 | if model_name == "llama": 45 | self.tokenizer_folder = os.path.join( 46 | self.root_folder, "models", "llama-2-7b-chat-hf" 47 | ) 48 | else: 49 | self.tokenizer_folder = os.path.join( 50 | self.root_folder, "models", "mistral-7b-v0.1-instruct-hf" 51 | ) 52 | 53 | def load_model_and_tokenizer(self): 54 | start_time = time.perf_counter() 55 | onnx_path = os.path.join(self.model_path, "model.onnx") 56 | config = AutoConfig.from_pretrained(self.model_path) 57 | 58 | # load the session and the model 59 | self.session = InferenceSession(onnx_path, providers=["CUDAExecutionProvider"]) 60 | self.model = ORTModelForCausalLM( 61 | self.session, config, use_cache=False, use_io_binding=False 62 | ) 63 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 64 | delta = time.perf_counter() - start_time 65 | self.logger.info(f"Model Loading time took: {delta:.2f} seconds") 66 | return self 67 | 68 | def preprocess( 69 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 70 | ): 71 | if chat_mode: 72 | template = self.get_chat_template_with_instruction( 73 | prompt=prompt, for_benchmarks=for_benchmarks 74 | ) 75 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 76 | 77 | tokenized_input = self.tokenizer.encode(text=prompt) 78 | tensor = self.tokenizer(prompt, return_tensors="pt").to(self.device) 79 | return { 80 | "prompt": prompt, 81 | "input_tokens": tokenized_input, 82 | "tensor": tensor, 83 | "num_input_tokens": len(tokenized_input), 84 | } 85 | 86 | @torch.inference_mode(mode=True) 87 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 88 | tensor = inputs["tensor"] 89 | num_input_tokens = inputs["num_input_tokens"] 90 | 91 | generated = self.model.generate( 92 | **tensor, 93 | do_sample=True, 94 | temperature=temperature, 95 | max_new_tokens=max_tokens, 96 | top_p=0.1, 97 | 
pad_token_id=self.tokenizer.eos_token_id, 98 | eos_token_id=self.tokenizer.eos_token_id, 99 | ) 100 | 101 | output_tokens = generated[0].detach().tolist()[num_input_tokens:] 102 | return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)} 103 | 104 | def postprocess(self, output: dict) -> str: 105 | output_tokens = output["output_tokens"] 106 | output_text = self.tokenizer.decode(output_tokens, skip_special_tokens=True) 107 | return output_text 108 | 109 | def on_exit(self): 110 | if self.device in ["cuda", "cuda:0"]: 111 | del self.model 112 | del self.session 113 | torch.cuda.synchronize() 114 | gc.collect() 115 | else: 116 | del self.model 117 | del self.session 118 | 119 | 120 | if __name__ == "__main__": 121 | parser = launch_cli(description="ONNX HF-Optimum Benchmark.") 122 | args = parser.parse_args() 123 | 124 | model_folder = "/mnt/benchmarks/models" 125 | model_name = ( 126 | f"{args.model_name}-2-7b-chat-onnx" 127 | if args.model_name == "llama" 128 | else f"{args.model_name}-7b-v0.1-instruct-onnx" 129 | ) 130 | 131 | runner_dict = { 132 | "cuda": [ 133 | { 134 | "precision": "float32", 135 | "model_path": os.path.join(model_folder, model_name + "-float32"), 136 | }, 137 | { 138 | "precision": "float16", 139 | "model_path": os.path.join(model_folder, model_name + "-float16"), 140 | }, 141 | ] 142 | } 143 | 144 | make_report( 145 | args=args, 146 | benchmark_class=ONNXOptimumBenchmark, 147 | runner_dict=runner_dict, 148 | benchmark_name="ONNX-HF-Optimum", 149 | is_bench_pytorch=False, 150 | ) 151 | -------------------------------------------------------------------------------- /bench_tensorrtllm/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import tensorrt_llm 5 | import torch 6 | from tensorrt_llm.runtime import ModelRunnerCpp 7 | from transformers import AutoTokenizer 8 | 9 | sys.path.append("/mnt") 10 | sys.path.append("/mnt/benchmarks/") 11 | 12 | from common.base import BaseBenchmarkClass # noqa 13 | from common.utils import launch_cli, make_report # noqa 14 | 15 | 16 | class TensorRTLLMBenchmark(BaseBenchmarkClass): 17 | def __init__( 18 | self, 19 | model_path: str, 20 | model_name: str, 21 | benchmark_name: str, 22 | precision: str, 23 | device: str, 24 | experiment_name: str, 25 | ) -> None: 26 | super().__init__( 27 | model_name=model_name, 28 | model_path=model_path, 29 | benchmark_name=benchmark_name, 30 | experiment_name=experiment_name, 31 | precision=precision, 32 | device=device, 33 | root_folder="/mnt/benchmarks", 34 | ) 35 | self.runtime_rank = tensorrt_llm.mpi_rank() 36 | if model_name == "llama": 37 | self.tokenizer_folder = os.path.join( 38 | self.root_folder, "models", "llama-2-7b-chat-hf" 39 | ) 40 | else: 41 | self.tokenizer_folder = os.path.join( 42 | self.root_folder, "models", "mistral-7b-v0.1-instruct-hf" 43 | ) 44 | 45 | def load_model_and_tokenizer(self): 46 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 47 | if self.tokenizer.pad_token_id is None: 48 | self.tokenizer.pad_token_id = self.tokenizer.eos_token_id 49 | self.pad_id = self.tokenizer.pad_token_id 50 | self.end_id = self.tokenizer.eos_token_id 51 | 52 | # load the runner kawargs 53 | runner_kwargs = dict( 54 | engine_dir=self.model_path, 55 | rank=self.runtime_rank, 56 | max_batch_size=1, 57 | max_input_len=512, 58 | max_output_len=512, 59 | max_beam_width=1, 60 | max_attention_window_size=None, 61 | sink_token_length=None, 62 | ) 63 | self.model = 
ModelRunnerCpp.from_dir(**runner_kwargs) 64 | return self 65 | 66 | def preprocess( 67 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 68 | ): 69 | if chat_mode: 70 | template = self.get_chat_template_with_instruction( 71 | prompt=prompt, for_benchmarks=for_benchmarks 72 | ) 73 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 74 | 75 | tokenized_input = self.tokenizer.encode(text=prompt) 76 | tensor = self.tokenizer.encode( 77 | prompt, return_tensors="pt", truncation=True 78 | ).squeeze(0) 79 | return { 80 | "prompt": prompt, 81 | "input_tokens": tokenized_input, 82 | "tensor": [tensor], 83 | "num_input_tokens": len(tokenized_input), 84 | } 85 | 86 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 87 | tensor = inputs["tensor"] 88 | num_input_tokens = inputs["num_input_tokens"] 89 | 90 | with torch.no_grad(): 91 | output = self.model.generate( 92 | tensor, 93 | max_new_tokens=max_tokens, 94 | temperature=temperature, 95 | pad_id=self.pad_id, 96 | end_id=self.end_id, 97 | return_dict=True, 98 | ) 99 | 100 | output_ids = output["output_ids"] 101 | output_tokens = output_ids[0][0].detach().cpu().tolist()[num_input_tokens:] 102 | 103 | return { 104 | "output_tokens": output_tokens, 105 | "num_output_tokens": len(output_tokens), 106 | } 107 | 108 | def postprocess(self, output: dict) -> str: 109 | output_tokens = output["output_tokens"] 110 | output_text = self.tokenizer.decode(output_tokens, skip_special_tokens=True) 111 | return output_text 112 | 113 | def on_exit(self): 114 | del self.model 115 | torch.cuda.synchronize() 116 | 117 | 118 | if __name__ == "__main__": 119 | parser = launch_cli(description="Nvidia TRT-LLM Benchmark.") 120 | args = parser.parse_args() 121 | 122 | model_folder = "/mnt/benchmarks/models" 123 | model_name = ( 124 | f"{args.model_name}-2-7b-chat-trt" 125 | if args.model_name == "llama" 126 | else f"{args.model_name}-7b-v0.1-instruct-trt" 127 | ) 128 | 129 | runner_dict = { 130 | "cuda": [ 131 | { 132 | "precision": "float32", 133 | "model_path": os.path.join(model_folder, model_name + "-float32"), 134 | }, 135 | { 136 | "precision": "float16", 137 | "model_path": os.path.join(model_folder, model_name + "-float16"), 138 | }, 139 | { 140 | "precision": "int8", 141 | "model_path": os.path.join(model_folder, model_name + "-int8"), 142 | }, 143 | { 144 | "precision": "int4", 145 | "model_path": os.path.join(model_folder, model_name + "-int4"), 146 | }, 147 | ] 148 | } 149 | 150 | make_report( 151 | args=args, 152 | benchmark_class=TensorRTLLMBenchmark, 153 | runner_dict=runner_dict, 154 | benchmark_name="Nvidia-TRT-LLM", 155 | is_bench_pytorch=False, 156 | ) 157 | -------------------------------------------------------------------------------- /bench_candle/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks candle llama benchmark. 
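#              Example (illustrative invocation, not part of the original script; the flags are the ones documented under OPTIONS below):
#                  ./bench.sh -d cuda -r 10 -m 100 -md ./models -lf candle_bench.log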
6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 100) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cpu') 13 | # -lf, --log_file Logging file name. 14 | # -md, --models_dir Models directory. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 21 | 22 | print_usage() { 23 | echo "Usage: $0 [OPTIONS]" 24 | echo "OPTIONS:" 25 | echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')" 26 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 27 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)" 28 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cpu')" 29 | echo " -lf, --log_file Logging file name." 30 | echo " -md, --models_dir Models directory." 31 | echo " -h, --help Show this help message" 32 | exit 1 33 | } 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_rust() { 47 | if which cargo &>/dev/null ; then 48 | echo -e "\nRust is installed. Using $(which cargo)" 49 | else 50 | echo -e "\nRust is not installed. Please install Rust before proceeding." 51 | exit 1 # Error exit code 52 | fi 53 | } 54 | 55 | check_platform() { 56 | local platform 57 | platform=$(uname -s) 58 | if [[ "$platform" == "Linux" ]]; then 59 | echo "Running on Linux." 60 | elif [[ "$platform" == "Darwin" ]]; then 61 | echo "Running on Mac OS." 62 | else 63 | echo "Unknown platform." 64 | exit 1 65 | fi 66 | } 67 | 68 | check_python() { 69 | if command -v python &> /dev/null 70 | then 71 | echo -e "\nUsing $(python --version)." 72 | else 73 | echo -e "\nPython is not installed." 74 | exit 1 75 | fi 76 | } 77 | 78 | setup() { 79 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 80 | bash "$SCRIPT_DIR/setup.sh" "$1" 81 | } 82 | 83 | run_benchmarks() { 84 | local PROMPT="$1" 85 | local REPETITIONS="$2" 86 | local MAX_TOKENS="$3" 87 | local DEVICE="$4" 88 | local LOG_FILENAME="$5" 89 | local MODELS_DIR="$6" 90 | 91 | if [ "$DEVICE" == "cpu" ] || [ "$DEVICE" == "cuda" ]; then 92 | CARGO_CANDLE_FEATURES=""; [ "$DEVICE" == "cuda" ] && CARGO_CANDLE_FEATURES="--features cuda" 93 | # shellcheck disable=SC2086 # word splitting of the optional feature flags is intentional 94 | cargo run --release $CARGO_CANDLE_FEATURES \ 95 | --manifest-path="$SCRIPT_DIR/llama2-candle/Cargo.toml" \ 96 | -- --local-weights "$MODELS_DIR/llama-2-7b-st/" \ 97 | --repetitions "$REPETITIONS" \ 98 | --prompt "$PROMPT" \ 99 | --sample-len "$MAX_TOKENS" \ 100 | --log-file "$LOG_FILENAME" 101 | fi 102 | } 103 | # Parse command-line arguments 104 | while [ "$#" -gt 0 ]; do 105 | case "$1" in 106 | -p|--prompt) 107 | PROMPT="$2" 108 | shift 2 109 | ;; 110 | -r|--repetitions) 111 | REPETITIONS="$2" 112 | shift 2 113 | ;; 114 | -m|--max_tokens) 115 | MAX_TOKENS="$2" 116 | shift 2 117 | ;; 118 | -d|--device) 119 | DEVICE="$2" 120 | case "$DEVICE" in 121 | "cuda" | "metal" | "cpu") 122 | ;; 123 | *) 124 | echo "Invalid value for --device. Please use 'cuda', 'metal' or 'cpu'."
125 | print_usage 126 | ;; 127 | esac 128 | if [ "$DEVICE" == "cuda" ]; then 129 | check_cuda 130 | fi 131 | if [ "$DEVICE" == "metal" ]; then 132 | echo "Metal not supported!" 133 | exit 0 134 | fi 135 | shift 2 136 | ;; 137 | -lf|--log_file) 138 | LOG_FILENAME="$2" 139 | shift 2 140 | ;; 141 | -md|--models_dir) 142 | MODELS_DIR="$2" 143 | shift 2 144 | ;; 145 | -h|--help) 146 | print_usage 147 | ;; 148 | *) 149 | echo "Unknown option: $1" 150 | print_usage 151 | ;; 152 | esac 153 | done 154 | 155 | # Set default values if not provided 156 | PROMPT="${PROMPT:-"Explain what is a transformer"}" 157 | REPETITIONS="${REPETITIONS:-10}" 158 | MAX_TOKENS="${MAX_TOKENS:-100}" 159 | DEVICE="${DEVICE:-cpu}" 160 | LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" 161 | MODELS_DIR="${MODELS_DIR:-"./models"}" 162 | 163 | check_platform 164 | check_rust 165 | check_python 166 | setup "$MODELS_DIR" 167 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR" 168 | -------------------------------------------------------------------------------- /bench_ctransformers/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs the CTransformers benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | 34 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 35 | 36 | 37 | check_cuda() { 38 | if command -v nvcc &> /dev/null 39 | then 40 | echo -e "\nUsing CUDA" 41 | nvcc --version 42 | else 43 | echo -e "\nCUDA is not available." 44 | exit 1 45 | fi 46 | } 47 | 48 | check_platform() { 49 | local platform 50 | platform=$(uname -s) 51 | if [[ "$platform" == "Linux" ]]; then 52 | echo "Running on Linux." 53 | elif [[ "$platform" == "Darwin" ]]; then 54 | echo "Running on Mac OS." 55 | else 56 | echo "Unknown platform."
57 | exit 1 58 | fi 59 | } 60 | 61 | check_python() { 62 | if command -v python &> /dev/null; then 63 | PYTHON_CMD="python" 64 | elif command -v python3 &> /dev/null; then 65 | PYTHON_CMD="python3" 66 | else 67 | echo "Python is not installed." 68 | exit 1 69 | fi 70 | } 71 | 72 | setup() { 73 | local MODEL_NAME="${1:-llama}" 74 | local DEVICE="$2" 75 | 76 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 77 | case "$DEVICE" in 78 | cuda) 79 | bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME" 80 | ;; 81 | metal) 82 | bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME" 83 | ;; 84 | cpu) 85 | bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME" 86 | ;; 87 | *) 88 | echo "Unsupported DEVICE: $DEVICE" 89 | exit 1 90 | ;; 91 | esac 92 | } 93 | 94 | run_benchmarks() { 95 | local PROMPT="$1" 96 | local REPETITIONS="$2" 97 | local MAX_TOKENS="$3" 98 | local DEVICE="$4" 99 | local MODEL_NAME="$5" 100 | 101 | # shellcheck disable=SC1091 102 | source "$SCRIPT_DIR/venv/bin/activate" 103 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 104 | --prompt "$PROMPT" \ 105 | --repetitions "$REPETITIONS" \ 106 | --max_tokens "$MAX_TOKENS" \ 107 | --model_name "$MODEL_NAME" \ 108 | --device "$DEVICE" 109 | } 110 | 111 | # Parse command-line arguments 112 | while [ "$#" -gt 0 ]; do 113 | case "$1" in 114 | -p|--prompt) 115 | PROMPT="$2" 116 | shift 2 117 | ;; 118 | -r|--repetitions) 119 | REPETITIONS="$2" 120 | shift 2 121 | ;; 122 | -m|--max_tokens) 123 | MAX_TOKENS="$2" 124 | shift 2 125 | ;; 126 | -d|--device) 127 | DEVICE="$2" 128 | case "$DEVICE" in 129 | "cuda" | "metal" | "cpu") 130 | ;; 131 | *) 132 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 133 | print_usage 134 | ;; 135 | esac 136 | if [ "$DEVICE" == "cuda" ]; then 137 | check_cuda 138 | else 139 | echo "Not supported for $DEVICE" 140 | exit 1 141 | fi 142 | shift 2 143 | ;; 144 | -n|--model_name) 145 | MODEL_NAME="$2" 146 | shift 2 147 | ;; 148 | -h|--help) 149 | print_usage 150 | ;; 151 | *) 152 | echo "Unknown option: $1" 153 | print_usage 154 | ;; 155 | esac 156 | done 157 | 158 | check_platform 159 | check_python 160 | setup "$MODEL_NAME" "$DEVICE" 161 | 162 | # Set default values if not provided 163 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 164 | REPETITIONS="${REPETITIONS:-10}" 165 | MAX_TOKENS="${MAX_TOKENS:-512}" 166 | DEVICE="${DEVICE:-'cuda'}" 167 | MODEL_NAME="${MODEL_NAME:-"llama"}" 168 | 169 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 170 | -------------------------------------------------------------------------------- /bench_pytorch/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from common.base import BaseBenchmarkClass # noqa 10 | from common.utils import launch_cli, make_report # noqa 11 | 12 | 13 | class PyTorchBenchmark(BaseBenchmarkClass): 14 | def __init__( 15 | self, 16 | model_path: str, 17 | model_name: str, 18 | benchmark_name: str, 19 | precision: str, 20 | device: str, 21 | experiment_name: str, 22 | ) -> None: 23 | super().__init__( 24 | model_name=model_name, 25 | model_path=model_path, 26 | benchmark_name=benchmark_name, 27 | experiment_name=experiment_name, 28 | precision=precision, 29 | device=device, 30 | ) 31 | 32 | @torch.inference_mode() 33 | def load_model_and_tokenizer(self): 34 | self.tokenizer = 
AutoTokenizer.from_pretrained(self.model_path) 35 | precision_dtype_mapping = {"float16": torch.float16, "float32": torch.float32} 36 | 37 | if self.precision in ["float16", "float32"]: 38 | device = "cuda:0" if self.device == "cuda" else self.device 39 | model_args = { 40 | "device_map": device, 41 | "torch_dtype": precision_dtype_mapping[self.precision], 42 | } 43 | self.model = AutoModelForCausalLM.from_pretrained( 44 | self.model_path, **model_args 45 | ) 46 | elif self.precision in ["int4", "int8"] and self.device in ["cuda:0", "cuda"]: 47 | from transformers import BitsAndBytesConfig 48 | 49 | bnb_config = ( 50 | BitsAndBytesConfig(load_in_8bit=True) 51 | if self.precision == "int8" 52 | else BitsAndBytesConfig( 53 | load_in_4bit=True, 54 | bnb_4bit_use_double_quant=True, 55 | bnb_4bit_quant_type="nf4", 56 | bnb_4bit_compute_dtype=torch.float16, 57 | ) 58 | ) 59 | 60 | if self.precision == "int8": 61 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 62 | 63 | self.model = AutoModelForCausalLM.from_pretrained( 64 | self.model_path, device_map=self.device, quantization_config=bnb_config 65 | ) 66 | else: 67 | raise ValueError( 68 | f"Invalid configuration: {self.device}, {self.precision}" 69 | "INT4/8 requires CUDA to execute." 70 | ) 71 | return self 72 | 73 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 74 | if chat_mode: 75 | template = self.get_chat_template_with_instruction( 76 | prompt=prompt, for_benchmarks=for_benchmarks 77 | ) 78 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 79 | 80 | tokenized_input = self.tokenizer.encode(text=prompt) 81 | tensor = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device) 82 | return { 83 | "prompt": prompt, 84 | "input_tokens": tokenized_input, 85 | "tensor": tensor, 86 | "num_input_tokens": len(tokenized_input), 87 | } 88 | 89 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 90 | tensor = inputs["tensor"] 91 | num_input_tokens = inputs["num_input_tokens"] 92 | 93 | output = ( 94 | self.model.generate( 95 | input_ids=tensor, 96 | max_new_tokens=max_tokens, 97 | temperature=temperature, 98 | do_sample=True, 99 | pad_token_id=self.tokenizer.eos_token_id, 100 | ) 101 | .detach() 102 | .tolist()[0] 103 | ) 104 | 105 | output_tokens = ( 106 | output[num_input_tokens:] if len(output) > num_input_tokens else output 107 | ) 108 | return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)} 109 | 110 | def postprocess(self, output: dict) -> str: 111 | output_tokens = output["output_tokens"] 112 | return self.tokenizer.decode(output_tokens, skip_special_tokens=True) 113 | 114 | def on_exit(self): 115 | if self.device == "cuda:0": 116 | del self.model 117 | torch.cuda.synchronize() 118 | else: 119 | del self.model 120 | 121 | 122 | if __name__ == "__main__": 123 | parser = launch_cli( 124 | description="HuggingFace Transformers Benchmark (PyTorch backend)" 125 | ) 126 | args = parser.parse_args() 127 | model_folder = os.path.join(os.getcwd(), "models") 128 | model_name = ( 129 | f"{args.model_name}-2-7b-chat-hf" 130 | if args.model_name == "llama" 131 | else f"{args.model_name}-7b-v0.1-instruct-hf" 132 | ) 133 | model_path = os.path.join(model_folder, model_name) 134 | precisions_mapping = { 135 | "cpu": ("float32",), 136 | "cuda": ("float32", "float16", "int8", "int4"), 137 | "metal": ("float32", "float16"), 138 | } 139 | runner_dict = {} 140 | for device, precisions in precisions_mapping.items(): 141 | runner_dict[device] = [ 142 
| {"precision": precision, "model_path": model_path} 143 | for precision in precisions 144 | ] 145 | report = make_report( 146 | args=args, 147 | benchmark_class=PyTorchBenchmark, 148 | runner_dict=runner_dict, 149 | benchmark_name="HF-Transformers (PyTorch Backend)", 150 | is_bench_pytorch=True, 151 | ) 152 | -------------------------------------------------------------------------------- /bench_ctranslate/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks CTranslate2 llama benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | CURRENT_DIR="$(pwd)" 34 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 35 | 36 | echo "$SCRIPT_DIR" 37 | 38 | check_platform() { 39 | local platform 40 | platform=$(uname -s) 41 | if [[ "$platform" == "Linux" ]]; then 42 | echo "Running on Linux." 43 | elif [[ "$platform" == "Darwin" ]]; then 44 | echo "Running on Mac OS." 45 | else 46 | echo "Unknown platform." 47 | exit 1 48 | fi 49 | } 50 | 51 | check_cuda() { 52 | if command -v nvcc &> /dev/null 53 | then 54 | echo -e "\nUsing CUDA" 55 | nvcc --version 56 | else 57 | echo -e "\nCUDA is not available." 58 | exit 1 59 | fi 60 | } 61 | 62 | check_python() { 63 | if command -v python &> /dev/null; then 64 | PYTHON_CMD="python" 65 | elif command -v python3 &> /dev/null; then 66 | PYTHON_CMD="python3" 67 | else 68 | echo "Python is not installed." 
69 | exit 1 70 | fi 71 | } 72 | 73 | setup() { 74 | local MODEL_NAME="${1:-llama}" 75 | 76 | if [[ "$MODEL_NAME" == "llama" ]]; then 77 | local model_pattern="$CURRENT_DIR/models/llama-2-7b-chat-ctranslate2-*" 78 | elif [[ "$MODEL_NAME" == "mistral" ]]; then 79 | local model_pattern="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-ctranslate2-*" 80 | else 81 | echo "No such model is supported" 82 | exit 1 83 | fi 84 | 85 | matching_dirs=$(ls -d "$model_pattern" 2>/dev/null) 86 | 87 | if [ -n "$matching_dirs" ]; then 88 | echo "Already exists skipping setup" 89 | else 90 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 91 | bash "$SCRIPT_DIR"/setup.sh "$MODEL_NAME" 92 | fi 93 | } 94 | 95 | run_benchmarks() { 96 | local PROMPT="$1" 97 | local REPETITIONS="$2" 98 | local MAX_TOKENS="$3" 99 | local DEVICE="$4" 100 | local MODEL_NAME="$5" 101 | 102 | # shellcheck disable=SC1091 103 | source "$SCRIPT_DIR/venv/bin/activate" 104 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 105 | --prompt "$PROMPT" \ 106 | --repetitions "$REPETITIONS" \ 107 | --max_tokens "$MAX_TOKENS" \ 108 | --model_name "$MODEL_NAME" \ 109 | --device "$DEVICE" 110 | } 111 | 112 | # Parse command-line arguments 113 | while [ "$#" -gt 0 ]; do 114 | case "$1" in 115 | -p|--prompt) 116 | PROMPT="$2" 117 | shift 2 118 | ;; 119 | -r|--repetitions) 120 | REPETITIONS="$2" 121 | shift 2 122 | ;; 123 | -m|--max_tokens) 124 | MAX_TOKENS="$2" 125 | shift 2 126 | ;; 127 | -d|--device) 128 | DEVICE="$2" 129 | case "$DEVICE" in 130 | "cuda" | "metal" | "cpu") 131 | ;; 132 | *) 133 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 134 | print_usage 135 | ;; 136 | esac 137 | if [ "$DEVICE" == "cuda" ]; then 138 | check_cuda 139 | else 140 | echo "Not supported for $DEVICE" 141 | exit 1 142 | fi 143 | shift 2 144 | ;; 145 | -n|--model_name) 146 | MODEL_NAME="$2" 147 | shift 2 148 | ;; 149 | -h|--help) 150 | print_usage 151 | ;; 152 | *) 153 | echo "Unknown option: $1" 154 | print_usage 155 | ;; 156 | esac 157 | done 158 | 159 | 160 | # Set default values if not provided 161 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 162 | REPETITIONS="${REPETITIONS:-10}" 163 | MAX_TOKENS="${MAX_TOKENS:-512}" 164 | DEVICE="${DEVICE:-'cuda'}" 165 | MODEL_NAME="${MODEL_NAME:-"llama"}" 166 | 167 | check_platform 168 | check_python 169 | setup "$MODEL_NAME" 170 | 171 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 172 | -------------------------------------------------------------------------------- /common/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import sys 6 | from collections import defaultdict 7 | from datetime import datetime 8 | 9 | import numpy as np 10 | 11 | 12 | def get_logger( 13 | benchmark_name: str, log_file_path: str = None, logging_level=logging.INFO 14 | ): 15 | logger = logging.getLogger(benchmark_name) 16 | if not logger.handlers: # Check if handlers have already been added 17 | logger.setLevel(logging_level) 18 | 19 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 20 | 21 | stream_handler = logging.StreamHandler(sys.stdout) 22 | stream_handler.setFormatter(formatter) 23 | logger.addHandler(stream_handler) 24 | 25 | if log_file_path is None: 26 | logfile_name = f"benchmark_{benchmark_name}_{datetime.now().strftime('%Y%m%d%H%M%S')}.log" 27 | log_file_path = os.path.join(os.getcwd(), "logs", logfile_name) 28 | 29 
| file_handler = logging.FileHandler(log_file_path) 30 | file_handler.setFormatter(formatter) 31 | logger.addHandler(file_handler) 32 | 33 | return logger 34 | 35 | 36 | def launch_cli(description: str): 37 | parser = argparse.ArgumentParser(description=description) 38 | parser.add_argument( 39 | "--prompt", 40 | type=str, 41 | help="The prompt for the model.", 42 | ) 43 | parser.add_argument("--max_tokens", type=int, help="The maximum number of tokens.") 44 | 45 | parser.add_argument( 46 | "--repetitions", 47 | type=int, 48 | help="The number of repetitions for the benchmark.", 49 | ) 50 | parser.add_argument( 51 | "--device", 52 | help="Device to use for the benchmark.", 53 | ) 54 | parser.add_argument( 55 | "--model_name", 56 | type=str, 57 | help="The name of the model to benchmark ('llama' or 'mistral').", 58 | ) 59 | 60 | parser.add_argument( 61 | "--temperature", 62 | type=float, 63 | help="Temperature to use.", 64 | ) 65 | 66 | return parser 67 | 68 | 69 | def make_report( 70 | args, benchmark_class, runner_dict, benchmark_name, is_bench_pytorch: bool = False 71 | ): 72 | experiment_name = f"{benchmark_name}-{str(datetime.now())}" 73 | report = defaultdict(lambda: defaultdict(float)) 74 | all_answers = {} 75 | 76 | for instance in runner_dict[args.device]: 77 | model_path, precision = instance["model_path"], instance["precision"] 78 | benchmark = benchmark_class( 79 | model_path=model_path, 80 | model_name=args.model_name, 81 | benchmark_name=benchmark_name, 82 | precision=precision, 83 | device=args.device, 84 | experiment_name=experiment_name, 85 | ).load_model_and_tokenizer() 86 | 87 | logger = benchmark.logger 88 | 89 | # First we do benchmarking 90 | benchmark.benchmark( 91 | prompt=args.prompt, 92 | max_tokens=args.max_tokens, 93 | repetitions=args.repetitions, 94 | temperature=args.temperature, 95 | ) 96 | 97 | # Make report for benchmarks 98 | # Memory usage seems to stay the same, so we take the max of it 99 | 100 | report[f"{args.model_name}-{benchmark_name} (token/sec)"][precision] = { 101 | "mean": np.mean(benchmark.tps_results), 102 | "std": np.std(benchmark.tps_results), 103 | } 104 | 105 | report[f"{args.model_name}-{benchmark_name} (memory usage)"][precision] = { 106 | "usage": max(benchmark.memory_usage_results) 107 | } 108 | 109 | # Second we get the answers 110 | benchmark.get_answers() 111 | all_answers[precision] = benchmark.answers 112 | 113 | # Make the final report 114 | 115 | for framework, quantizations in report.items(): 116 | for quantization, stats in quantizations.items(): 117 | if framework == f"{args.model_name}-{benchmark_name} (memory usage)": 118 | logger.info(f"{framework}, {quantization}: {stats['usage']} MB") 119 | else: 120 | logger.info( 121 | f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}" 122 | ) 123 | # Finally write the quality checks results 124 | logger.info("Writing the model completions for empirical tests") 125 | with open(benchmark.answers_json_path, "w") as json_file: 126 | json.dump(all_answers, json_file) 127 | 128 | logger.info("Benchmarking Finished") 129 | markdown_content = make_markdown( 130 | input_json_path=benchmark.answers_json_path, is_bench_pytorch=is_bench_pytorch 131 | ) 132 | 133 | with open(os.path.join(benchmark.log_folder, "quality.md"), "w") as readme_file: 134 | readme_file.write("\n".join(markdown_content)) 135 | 136 | print("quality.md has been created with the table.") 137 | 138 | 139 | def make_markdown(input_json_path: str, is_bench_pytorch: bool = False): 140 | with open(input_json_path, "r") as file: 141
data = json.load(file) 142 | 143 | precisions = list(data.keys()) 144 | markdown_content = [] 145 | 146 | # Helper function to create a markdown table row 147 | def create_row(items): 148 | return "| " + " | ".join(items) + " |" 149 | 150 | # Build headers based on the mode 151 | if is_bench_pytorch: 152 | headers = ["Question"] + precisions 153 | else: 154 | headers = ["Question"] + precisions + ["Ground Truth"] 155 | 156 | markdown_content.append(create_row(headers)) 157 | markdown_content.append(create_row(["---"] * len(headers))) 158 | 159 | # Build the Markdown 160 | for idx, question in enumerate(data[precisions[0]]): 161 | question_text = question.get( 162 | "prompt" if is_bench_pytorch else "question", "" 163 | ).replace("\n", " ") 164 | 165 | answers = [ 166 | data[precision][idx]["actual"].replace("\n", "
") 167 | for precision in precisions 168 | ] 169 | row_items = [question_text] + answers 170 | 171 | if not is_bench_pytorch: 172 | ground_truths = [ 173 | data[precision][idx]["expected"].replace("\n", "
") 174 | for precision in precisions 175 | ] 176 | row_items += ground_truths 177 | markdown_content.append(create_row(row_items)) 178 | 179 | return markdown_content 180 | --------------------------------------------------------------------------------