├── common
│   ├── __init__.py
│   ├── memory_tracker.py
│   └── utils.py
├── models
│   └── .gitkeep
├── logs
│   ├── llama
│   │   └── .gitkeep
│   └── mistral
│       └── .gitkeep
├── bench_lightning
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── image.png
├── bench_candle
│   ├── requirements.txt
│   ├── llama2-candle
│   │   └── Cargo.toml
│   ├── setup.sh
│   ├── README.md
│   ├── convert_to_safetensors.py
│   └── bench.sh
├── bench_deepspeed
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── bench_exllamav2
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── bench_ctransformers
│   ├── requirements.txt
│   ├── bench.py
│   ├── setup.sh
│   └── bench.sh
├── bench_ctranslate
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── setup.cfg
├── bench_llamacpp
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── bench_autoawq
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── bench_pytorch
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.sh
│   └── bench.py
├── bench_autogptq
│   ├── requirements.txt
│   ├── setup.sh
│   ├── bench.sh
│   └── bench.py
├── .github
│   └── workflows
│       └── precommit.yaml
├── models.json
├── .pre-commit-config.yaml
├── LICENSE
├── bench_optimum_nvidia
│   ├── converter.py
│   ├── setup.sh
│   ├── bench.py
│   └── bench.sh
├── bench_onnxruntime
│   ├── setup.sh
│   ├── bench.sh
│   └── bench.py
├── benchmark.sh
├── download.sh
├── questions.json
├── docs
│   ├── archive.md
│   └── ml_engines.md
├── .gitignore
├── bench_tensorrtllm
│   ├── bench.sh
│   └── bench.py
└── bench_vllm
    ├── bench.py
    ├── bench.sh
    └── setup.sh
/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /logs/llama/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /logs/mistral/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bench_lightning/requirements.txt: -------------------------------------------------------------------------------- 1 | scipy==1.13.0 2 | bitsandbytes==0.43.1 3 | -------------------------------------------------------------------------------- /image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/premAI-io/benchmarks/HEAD/image.png -------------------------------------------------------------------------------- /bench_candle/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.0 2 | safetensors==0.4.0 3 | numpy==1.26.2 4 | -------------------------------------------------------------------------------- /bench_deepspeed/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed-mii==0.2.3 2 | mpi4py==3.1.5 3 | sentencepiece==0.2.0 4 | -------------------------------------------------------------------------------- /bench_exllamav2/requirements.txt: -------------------------------------------------------------------------------- 1 | exllamav2==0.0.19 2 | tqdm==4.66.2 3 | tokenizers==0.15.2 4 | transformers==4.40.0 5 | --------------------------------------------------------------------------------
/bench_ctransformers/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.4 2 | huggingface-hub>=0.17.1 3 | transformers==4.38.2 4 | torch==2.2.2 5 | -------------------------------------------------------------------------------- /bench_ctranslate/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece==0.1.99 2 | ctranslate2==4.1.0 3 | transformers==4.35.0 4 | torch==2.1.0 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | exclude = .tox,.git,*/migrations/*,*/static/CACHE/*,docs,node_modules,venv 4 | -------------------------------------------------------------------------------- /bench_llamacpp/requirements.txt: -------------------------------------------------------------------------------- 1 | llama_cpp_python==0.2.62 2 | huggingface_hub==0.22.2 3 | transformers==4.39.3 4 | torch==2.2.2 5 | -------------------------------------------------------------------------------- /bench_autoawq/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.2.2 2 | accelerate==0.28.0 3 | transformers==4.38.2 4 | optimum==1.18.0 5 | autoawq==0.2.4 6 | autoawq-kernels==0.0.6 7 | -------------------------------------------------------------------------------- /bench_pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.39.3 2 | torch==2.2.2 3 | accelerate==0.28.0 4 | sentencepiece==0.2.0 5 | protobuf==0.2.0 6 | bitsandbytes==0.43.1 7 | -------------------------------------------------------------------------------- /bench_autogptq/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.4 2 | gekko==1.1.0 3 | pandas==2.2.1 4 | huggingface_hub==0.22.2 5 | torch==2.2.1 6 | transformers==4.38.2 7 | fsspec[http]>=2023.1.0,<=2024.2.0 8 | -------------------------------------------------------------------------------- /.github/workflows/precommit.yaml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | branches: [main, dev] 6 | push: 7 | branches: [main, dev] 8 | 9 | jobs: 10 | pre-commit: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - uses: actions/setup-python@v3 15 | - uses: pre-commit/action@v3.0.0 16 | -------------------------------------------------------------------------------- /models.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "url": "https://prem-models.s3.eu-central-1.amazonaws.com/llama-v2/Llama-2-7b-chat-hf.zip", 4 | "file": "llama-2-7b-chat-hf.zip", 5 | "folder": "./models/llama-2-7b-chat-hf" 6 | }, 7 | { 8 | "url": "https://prem-models.s3.eu-central-1.amazonaws.com/mistral-0.1/Mistral-7B-Instruct-v0.1.zip", 9 | "file": "mistral-7b-v0.1-instruct.zip", 10 | "folder": "./models/mistral-7b-v0.1-instruct-hf" 11 | } 12 | ] 13 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_stages: [commit] 2 | 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.5.0 6 | hooks: 7 | - id: 
trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: end-of-file-fixer 11 | - id: check-toml 12 | - id: check-xml 13 | - id: debug-statements 14 | - id: check-builtin-literals 15 | - id: check-case-conflict 16 | 17 | - repo: https://github.com/psf/black 18 | rev: 23.11.0 19 | hooks: 20 | - id: black 21 | 22 | - repo: https://github.com/PyCQA/isort 23 | rev: 5.12.0 24 | hooks: 25 | - id: isort 26 | 27 | - repo: https://github.com/PyCQA/flake8 28 | rev: 6.1.0 29 | hooks: 30 | - id: flake8 31 | args: ["--config=setup.cfg"] 32 | additional_dependencies: [flake8-isort] 33 | 34 | - repo: https://github.com/shellcheck-py/shellcheck-py 35 | rev: v0.9.0.6 36 | hooks: 37 | - id: shellcheck 38 | 39 | ci: 40 | autoupdate_schedule: weekly 41 | skip: [] 42 | submodules: false 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Prem 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bench_candle/llama2-candle/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "llama2-candle" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | accelerate-src = { version = "0.3.2", optional = true } 10 | anyhow = { version = "1", features = ["backtrace"] } 11 | candle = { version = "0.5.1", package = "candle-core" } 12 | candle-examples = { version = "0.5.1", package = "candle-examples" } 13 | candle-nn = { version = "0.5.1", package = "candle-nn" } 14 | candle-transformers = { version = "0.5.1", package = "candle-transformers" } 15 | clap = { version = "4.2.4", features = ["derive"] } 16 | env_logger = "0.10.0" 17 | hf-hub = "0.3.2" 18 | imageproc = { version = "0.23.0", default-features = false } 19 | log = "0.4" 20 | rand = "0.8.5" 21 | rusttype = { version = "0.9", default-features = false } 22 | serde_json = "1.0.99" 23 | tokenizers = { version = "0.19.1", features = ["onig"] } 24 | tracing-chrome = "0.7.1" 25 | tracing-subscriber = "0.3.7" 26 | 27 | [features] 28 | accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"] 29 | cuda = ["candle/cuda", "candle-nn/cuda", "candle-transformers/cuda"] 30 | -------------------------------------------------------------------------------- /bench_deepspeed/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 12 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 13 | VENV_DIR="$SCRIPT_DIR/venv" 14 | 15 | check_python() { 16 | if command -v python &> /dev/null; then 17 | PYTHON_CMD="python" 18 | elif command -v python3 &> /dev/null; then 19 | PYTHON_CMD="python3" 20 | else 21 | echo "Python is not installed." 22 | exit 1 23 | fi 24 | } 25 | 26 | check_python 27 | 28 | if [ ! -d "$VENV_DIR" ]; then 29 | "$PYTHON_CMD" -m venv "$VENV_DIR" 30 | echo "Virtual environment '$VENV_DIR' created." 31 | # shellcheck disable=SC1091 32 | source "$VENV_DIR/bin/activate" 33 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 34 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 35 | else 36 | # shellcheck disable=SC1091 37 | source "$VENV_DIR/bin/activate" 38 | fi 39 | -------------------------------------------------------------------------------- /bench_pytorch/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 
12 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 13 | VENV_DIR="$SCRIPT_DIR/venv" 14 | 15 | check_python() { 16 | if command -v python &> /dev/null; then 17 | PYTHON_CMD="python" 18 | elif command -v python3 &> /dev/null; then 19 | PYTHON_CMD="python3" 20 | else 21 | echo "Python is not installed." 22 | exit 1 23 | fi 24 | } 25 | 26 | check_python 27 | 28 | if [ ! -d "$VENV_DIR" ]; then 29 | "$PYTHON_CMD" -m venv "$VENV_DIR" 30 | echo "Virtual environment '$VENV_DIR' created." 31 | # shellcheck disable=SC1091 32 | source "$VENV_DIR/bin/activate" 33 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 34 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 35 | else 36 | # shellcheck disable=SC1091 37 | source "$VENV_DIR/bin/activate" 38 | fi 39 | -------------------------------------------------------------------------------- /bench_candle/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: This script automates the setup of a virtual environment, 6 | # installs project requirements, converts model. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | if [ "$#" -ne 1 ]; then 12 | echo "Usage: $0 " 13 | exit 1 14 | fi 15 | 16 | # Define directory paths 17 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 18 | VENV_DIR="$SCRIPT_DIR/venv" 19 | MODELS_FOLDER="$1" 20 | LLAMA_HF_MODEL_DIR="$MODELS_FOLDER/llama-2-7b-hf" 21 | LLAMA_ST_MODEL_DIR="$MODELS_FOLDER/llama-2-7b-st" 22 | 23 | if [ ! -d "$VENV_DIR" ]; then 24 | python -m venv "$VENV_DIR" 25 | echo "Virtual environment '$VENV_DIR' created." 26 | # shellcheck disable=SC1091 27 | source "$VENV_DIR"/bin/activate 28 | pip install --upgrade pip > /dev/null 29 | pip install -r "$SCRIPT_DIR"/requirements.txt > /dev/null 30 | else 31 | # shellcheck disable=SC1091 32 | source "$VENV_DIR"/bin/activate 33 | fi 34 | 35 | if [ ! -d "$LLAMA_ST_MODEL_DIR" ]; then 36 | echo "Storing llama-2-7b-hf in safetensors format..." 37 | python "$SCRIPT_DIR"/convert_to_safetensors.py --input_dir "$LLAMA_HF_MODEL_DIR" --output_dir "$LLAMA_ST_MODEL_DIR" 38 | else 39 | echo "Model llama-2-7b-hf in safetensors format already exists!" 
40 | fi 41 | -------------------------------------------------------------------------------- /bench_optimum_nvidia/converter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | import torch 6 | from optimum.nvidia import AutoModelForCausalLM 7 | 8 | # Some points to note: 9 | # - the conversion is super simple, and it assumes batch size to be 1 and 10 | # num beams to be 1 11 | # - it also assumes a standard prompt length of 512 tokens 12 | 13 | 14 | def build_engine(hf_model_path: str, out_model_dir: str, torch_dtype: str): 15 | if not os.path.isdir(out_model_dir): 16 | os.makedirs(out_model_dir, exist_ok=True) 17 | 18 | dtype_mapper = {"float16": torch.float16, "float32": torch.float32} 19 | 20 | try: 21 | logging.info("Starting to build the model engine") 22 | model = AutoModelForCausalLM.from_pretrained( 23 | pretrained_model_name_or_path=hf_model_path, 24 | max_batch_size=1, 25 | max_prompt_length=512, 26 | num_beams=1, 27 | torch_dtype=dtype_mapper[torch_dtype], 28 | ) 29 | 30 | model.save_pretrained(save_directory=out_model_dir) 31 | except Exception as e: 32 | logging.info(f"Error: {e}") 33 | os.rmdir(out_model_dir) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser("HF Optimum builder engine CLI") 38 | parser.add_argument( 39 | "--hf_dir", 40 | type=str, 41 | help="Hugging Face model weights path", 42 | ) 43 | 44 | parser.add_argument( 45 | "--out_dir", 46 | type=str, 47 | help="The output engine dir", 48 | ) 49 | 50 | parser.add_argument( 51 | "--dtype", 52 | type=str, 53 | help="The precision in which it will be saved. Supported: 'float16' and 'float32'", 54 | ) 55 | 56 | args = parser.parse_args() 57 | build_engine( 58 | hf_model_path=args.hf_dir, out_model_dir=args.out_dir, torch_dtype=args.dtype 59 | ) 60 | -------------------------------------------------------------------------------- /bench_candle/README.md: -------------------------------------------------------------------------------- 1 | # Candle 2 | 3 | [![GitHub Repo](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/huggingface/candle) &nbsp; 4 | 5 | [Candle](https://github.com/huggingface/candle) is a minimalistic Machine/Deep Learning framework written in Rust by [huggingface](https://github.com/huggingface). It tries to provide a simpler interface to implement models along with GPU support. This is a modified implementation of the [Llama2-Candle example](https://github.com/huggingface/candle/blob/main/candle-examples/examples/llama/main.rs) to analyse the benchmark performance across different devices and precisions. 6 | 7 | 8 | ### 🚀 Running the Candle Benchmark. 9 | 10 | To run this benchmark, make sure you have [Rust installed](https://www.rust-lang.org/tools/install). You can run the Candle benchmark using the following command: 11 | 12 | ```bash 13 | ./bench_candle/bench.sh \ 14 | --prompt \ # Enter a prompt string 15 | --max_tokens \ # Maximum number of tokens to output 16 | --repetitions \ # Number of repetitions to be made for the prompt. 17 | --log_file \ # A .log file under which we want to write the results. 18 | --device \ # The device on which we want to benchmark.
19 | --models_dir # The directory in which model weights are present 20 | ``` 21 | 22 | To get started quickly, you can simply run: 23 | 24 | ```bash 25 | ./bench_candle/bench.sh -d cuda 26 | ``` 27 | This will use all the default values (see the [bench.sh](/bench_candle/bench.sh) file) and perform the benchmarks. You can find all the benchmark results for Candle [here](/docs/llama2.md). 28 | 29 | 30 | ### 👀 Some points to note: 31 | 32 | 1. Running this benchmark requires [HuggingFace Llama2-7B weights](https://huggingface.co/meta-llama/Llama-2-7b), so it assumes that you have already agreed to the required [terms and conditions](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and have been verified to download the weights. 33 | 2. Candle does not have support for Metal devices. 34 | 3. Candle does support [quantized models](https://github.com/huggingface/candle/blob/main/candle-examples/examples/quantized/main.rs). Benchmarks for quantized Candle models will be available in future versions. 35 | -------------------------------------------------------------------------------- /bench_optimum_nvidia/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 12 | 13 | CURRENT_DIR="$(pwd)" 14 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 15 | 16 | check_docker() { 17 | if command -v docker &> /dev/null; then 18 | return 0 19 | else 20 | return 1 21 | fi 22 | } 23 | 24 | 25 | build_docker_image () { 26 | # Check if the Docker image exists 27 | if docker image inspect huggingface/optimum-nvidia:latest &> /dev/null; then 28 | echo "Image 'huggingface/optimum-nvidia:latest' already exists." 29 | else 30 | docker pull huggingface/optimum-nvidia:latest 31 | fi 32 | } 33 | 34 | build_and_compile_model () { 35 | echo "Running and building the model inside Docker..." 36 | local MODEL_NAME="$1" 37 | local PRECISION="$2" 38 | 39 | # Set the default folder paths for HF and engines 40 | LLAMA2_WEIGHTS_FOLDER="/mnt/models/llama-2-7b-chat" 41 | MISTRAL_WEIGHTS_FOLDER="/mnt/models/mistral-7b-v0.1-instruct" 42 | 43 | if [ "$MODEL_NAME" = "llama" ]; then 44 | HF_DIR="$LLAMA2_WEIGHTS_FOLDER-hf" 45 | ENGINE_DIR="$LLAMA2_WEIGHTS_FOLDER-optimum-$PRECISION" 46 | OUT_DIR="$CURRENT_DIR/models/llama-2-7b-chat-optimum-$PRECISION" 47 | 48 | elif [ "$MODEL_NAME" = "mistral" ]; then 49 | HF_DIR="$MISTRAL_WEIGHTS_FOLDER-hf" 50 | ENGINE_DIR="$MISTRAL_WEIGHTS_FOLDER-optimum-$PRECISION" 51 | OUT_DIR="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-optimum-$PRECISION" 52 | else 53 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 54 | exit 1 55 | fi 56 | 57 | if [ !
-d "$OUT_DIR" ]; then 58 | docker run --gpus all \ 59 | --ipc=host \ 60 | --ulimit memlock=-1 \ 61 | --ulimit stack=67108864 \ 62 | -v "$CURRENT_DIR"/models:/mnt/models \ 63 | -v "$SCRIPT_DIR/converter.py":/mnt/converter.py \ 64 | huggingface/optimum-nvidia:latest \ 65 | python3 /mnt/converter.py --hf_dir "$HF_DIR" --out_dir "$ENGINE_DIR" --dtype "$PRECISION" 66 | else 67 | echo "Engine file already exists" 68 | fi 69 | 70 | } 71 | 72 | 73 | MODEL_NAME="${1:-"llama"}" 74 | 75 | if check_docker; then 76 | build_docker_image 77 | build_and_compile_model "$MODEL_NAME" "float32" 78 | build_and_compile_model "$MODEL_NAME" "float16" 79 | else 80 | echo "Docker is not installed or not in the PATH" 81 | fi 82 | -------------------------------------------------------------------------------- /common/memory_tracker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from contextlib import contextmanager 4 | from multiprocessing import Pipe, Process 5 | from multiprocessing.connection import Connection 6 | 7 | # Adapted from optimum-benchmark, I don't trust pytorch peak memory memory info when external libs are used. 8 | # source: https://github.com/huggingface/optimum/blob/main/tests/benchmark/benchmark_gptq.py 9 | 10 | 11 | class MemoryTracker: 12 | def __init__(self): 13 | self.peak_memory: int = 0 14 | self.device_index = int( 15 | os.environ.get("CUDA_VISIBLE_DEVICES", "0,").split(",")[0] 16 | ) 17 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 18 | 19 | @contextmanager 20 | def track(self, interval: float = 0.1): 21 | print(f"Tracking memory for device {self.device_index}") 22 | yield from self._track_peak_memory(interval) 23 | 24 | def _track_peak_memory(self, interval: float): 25 | child_connection, parent_connection = Pipe() 26 | # instantiate process 27 | mem_process: Process = PeakMemoryMeasureProcess( 28 | self.device_index, child_connection, interval 29 | ) 30 | mem_process.start() 31 | # wait until we get memory 32 | parent_connection.recv() 33 | yield 34 | # start parent connection 35 | parent_connection.send(0) 36 | # receive peak memory 37 | self.peak_memory = parent_connection.recv() 38 | 39 | 40 | class PeakMemoryMeasureProcess(Process): 41 | def __init__( 42 | self, device_index: int, child_connection: Connection, interval: float 43 | ): 44 | super().__init__() 45 | self.device_index = device_index 46 | self.interval = interval 47 | self.connection = child_connection 48 | self.mem_usage = 0 49 | 50 | def run(self): 51 | self.connection.send(0) 52 | stop = False 53 | 54 | command = ( 55 | f"nvidia-smi --query-gpu=memory.used --format=csv --id={self.device_index}" 56 | ) 57 | 58 | while True: 59 | # py3nvml is broken since it outputs only the reserved memory, and nvidia-smi has only the MiB precision. 
60 | gpu_mem_mb = ( 61 | subprocess.check_output(command.split()) 62 | .decode("ascii") 63 | .split("\n")[1] 64 | .split()[0] 65 | ) 66 | gpu_mem_mb = int(gpu_mem_mb) * 1.048576 67 | self.mem_usage = max(self.mem_usage, gpu_mem_mb) 68 | 69 | if stop: 70 | break 71 | stop = self.connection.poll(self.interval) 72 | 73 | # send results to parent pipe 74 | self.connection.send(self.mem_usage) 75 | self.connection.close() 76 | -------------------------------------------------------------------------------- /bench_llamacpp/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | check_python() { 12 | if command -v python &> /dev/null; then 13 | PYTHON_CMD="python" 14 | elif command -v python3 &> /dev/null; then 15 | PYTHON_CMD="python3" 16 | else 17 | echo "Python is not installed." 18 | exit 1 19 | fi 20 | } 21 | 22 | clone_and_build_llama() { 23 | local DEVICE="$1" 24 | local VENV_DIR="$2" 25 | local SCRIPT_DIR="$3" 26 | 27 | if [ "$#" -ne 3 ]; then 28 | echo "Usage: $0 " 29 | exit 1 30 | fi 31 | 32 | case "$DEVICE" in 33 | cuda) 34 | export LLAMA_CUBLAS=on 35 | ;; 36 | metal) 37 | export LLAMA_METAL=on 38 | ;; 39 | cpu) 40 | return 0 41 | ;; 42 | *) 43 | echo "Unsupported DEVICE: $DEVICE" 44 | return 1 45 | ;; 46 | esac 47 | 48 | local LIBLLAMA_FILE="$VENV_DIR/libllama_$DEVICE.so" 49 | 50 | if [ -e "$LIBLLAMA_FILE" ]; then 51 | echo "File $LIBLLAMA_FILE exists." 52 | exit 0 53 | fi 54 | 55 | # Remove existing llama.cpp directory if it exists 56 | if [ -d "$SCRIPT_DIR/llama.cpp" ]; then 57 | echo "Removing existing llama.cpp directory..." 58 | rm -rf "$SCRIPT_DIR"/llama.cpp 59 | fi 60 | 61 | git clone --depth=1 https://github.com/ggerganov/llama.cpp "$SCRIPT_DIR"/llama.cpp 62 | cd "$SCRIPT_DIR"/llama.cpp 63 | 64 | # Build llama.cpp 65 | make clean > /dev/null 66 | echo "Building llama.cpp..." 67 | make libllama.so > /dev/null 68 | cp libllama.so "$LIBLLAMA_FILE" 69 | cd "$SCRIPT_DIR" 70 | 71 | rm -rf "$SCRIPT_DIR"/llama.cpp 72 | } 73 | 74 | # CLI Args 75 | DEVICE="$1" 76 | 77 | # Define directory paths 78 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 79 | VENV_DIR="$SCRIPT_DIR/venv" 80 | LIBLLAMA_FILE="$VENV_DIR/libllama_$DEVICE.so" 81 | 82 | check_python 83 | 84 | if [ ! -d "$VENV_DIR" ]; then 85 | "$PYTHON_CMD" -m venv "$VENV_DIR" 86 | echo "Virtual environment '$VENV_DIR' created." 87 | # shellcheck disable=SC1091 88 | source "$VENV_DIR/bin/activate" 89 | pip install --upgrade pip > /dev/null 90 | pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 91 | else 92 | # shellcheck disable=SC1091 93 | source "$VENV_DIR/bin/activate" 94 | fi 95 | 96 | clone_and_build_llama "$DEVICE" "$VENV_DIR" "$SCRIPT_DIR" 97 | -------------------------------------------------------------------------------- /bench_lightning/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 
7 | ################################################################################ 8 | 9 | CURRENT_DIR="$(pwd)" 10 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 11 | VENV_DIR="$SCRIPT_DIR/venv" 12 | 13 | check_python() { 14 | if command -v python &> /dev/null; then 15 | PYTHON_CMD="python" 16 | elif command -v python3 &> /dev/null; then 17 | PYTHON_CMD="python3" 18 | else 19 | echo "Python is not installed." 20 | exit 1 21 | fi 22 | } 23 | 24 | 25 | setup_environment() { 26 | if [ ! -d "$VENV_DIR" ]; then 27 | "$PYTHON_CMD" -m venv "$VENV_DIR" 28 | echo "Virtual environment '$VENV_DIR' created." 29 | # shellcheck disable=SC1091 30 | source "$VENV_DIR/bin/activate" 31 | pip install --upgrade pip > /dev/null 32 | 33 | # install everything 34 | pip install 'litgpt[all] @ git+https://github.com/Lightning-AI/litgpt' 35 | pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 36 | echo "Successfully installed lit-gpt and it's dependencies" 37 | else 38 | # shellcheck disable=SC1091 39 | source "$VENV_DIR/bin/activate" 40 | fi 41 | } 42 | 43 | convert_hf_to_litgpt() { 44 | local MODEL_NAME="$1" 45 | 46 | # This trick is done because LitGPT expects specific folder name / checkpoint_dir 47 | # Llama-2-7b-chat-hf or Mistral-7B-Instruct-v0.1 48 | TEMP_DIR="" 49 | LITGPT_DIR="" 50 | BACK_TO_DIR="" 51 | 52 | if [ "$MODEL_NAME" = "llama" ]; then 53 | TEMP_DIR="$CURRENT_DIR/models/Llama-2-7b-chat-hf" 54 | LITGPT_DIR="$CURRENT_DIR/models/llama-2-7b-chat-litgpt" 55 | BACK_TO_DIR="$CURRENT_DIR/models/llama-2-7b-chat-hf" 56 | elif [ "$MODEL_NAME" = "mistral" ]; then 57 | TEMP_DIR="$CURRENT_DIR/models/Mistral-7B-Instruct-v0.1" 58 | LITGPT_DIR="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-litgpt" 59 | BACK_TO_DIR="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-hf" 60 | else 61 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 62 | exit 1 63 | fi 64 | 65 | if [ -d "$LITGPT_DIR" ]; then 66 | echo "Already converted" 67 | exit 0 68 | else 69 | mv "$BACK_TO_DIR" "$TEMP_DIR" 70 | mkdir -p "$LITGPT_DIR" 71 | litgpt convert to_litgpt --checkpoint_dir "$TEMP_DIR" 72 | mv "$TEMP_DIR/model_config.yaml" "$TEMP_DIR/lit_model.pth" "$LITGPT_DIR/" 73 | cp -r "$TEMP_DIR/tokenizer.model" "$TEMP_DIR/tokenizer_config.json" "$LITGPT_DIR/" 74 | mv "$TEMP_DIR" "$BACK_TO_DIR" 75 | fi 76 | } 77 | 78 | 79 | MODEL_NAME="$1" 80 | 81 | check_python 82 | setup_environment 83 | convert_hf_to_litgpt "$MODEL_NAME" 84 | -------------------------------------------------------------------------------- /bench_onnxruntime/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 12 | 13 | CURRENT_DIR="$(pwd)" 14 | 15 | check_docker() { 16 | if command -v docker &> /dev/null; then 17 | return 0 18 | else 19 | return 1 20 | fi 21 | } 22 | 23 | 24 | build_docker_image () { 25 | # Check if the Docker image exists 26 | if docker image inspect anindyadeep/onnxruntime:latest &> /dev/null; then 27 | echo "Image 'anindyadeep/onnxruntime:latest' already exists." 
28 | else 29 | docker pull anindyadeep/onnxruntime:latest 30 | fi 31 | } 32 | 33 | build_and_compile_model () { 34 | echo "Running and building the model inside Docker..." 35 | local MODEL_NAME="$1" 36 | local PRECISION="$2" 37 | local DEVICE="$3" 38 | 39 | # Set the default folder paths for HF and engines 40 | LLAMA2_WEIGHTS_FOLDER="/mnt/models/llama-2-7b-chat" 41 | MISTRAL_WEIGHTS_FOLDER="/mnt/models/mistral-7b-v0.1-instruct" 42 | 43 | if [ "$MODEL_NAME" = "llama" ]; then 44 | HF_DIR="$LLAMA2_WEIGHTS_FOLDER-hf" 45 | ENGINE_DIR="$LLAMA2_WEIGHTS_FOLDER-onnx-$PRECISION" 46 | OUT_DIR="$CURRENT_DIR/models/llama-2-7b-chat-onnx-$PRECISION" 47 | 48 | elif [ "$MODEL_NAME" = "mistral" ]; then 49 | HF_DIR="$MISTRAL_WEIGHTS_FOLDER-hf" 50 | ENGINE_DIR="$MISTRAL_WEIGHTS_FOLDER-onnx-$PRECISION" 51 | OUT_DIR="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-onnx-$PRECISION" 52 | else 53 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 54 | exit 1 55 | fi 56 | 57 | if [ "$PRECISION" = "float32" ]; then 58 | ONNX_PRECISION="fp32" 59 | elif [ "$PRECISION" = "float16" ]; then 60 | ONNX_PRECISION="fp16" 61 | else 62 | echo "Supported precision: 'float32' and 'float16'" 63 | exit 1 64 | fi 65 | 66 | if [ ! -d "$OUT_DIR" ]; then 67 | docker run --gpus all \ 68 | --ipc=host \ 69 | --ulimit memlock=-1 \ 70 | --ulimit stack=67108864 \ 71 | -v "$CURRENT_DIR"/models:/mnt/models \ 72 | anindyadeep/onnxruntime:latest \ 73 | optimum-cli export onnx --model "$HF_DIR" \ 74 | --task text-generation --framework pt \ 75 | --opset 17 --sequence_length 1024 \ 76 | --batch_size 1 --device "$DEVICE" \ 77 | --dtype "$ONNX_PRECISION" "$ENGINE_DIR" 78 | else 79 | echo "Engine file already exists" 80 | fi 81 | 82 | } 83 | 84 | 85 | MODEL_NAME="${1:-"llama"}" 86 | DEVICE="$2" 87 | 88 | if check_docker; then 89 | build_docker_image 90 | build_and_compile_model "$MODEL_NAME" "float32" "$DEVICE" 91 | build_and_compile_model "$MODEL_NAME" "float16" "$DEVICE" 92 | else 93 | echo "Docker is not installed or not in the PATH" 94 | fi 95 | -------------------------------------------------------------------------------- /benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | print_usage() { 5 | echo "Usage: $0 [OPTIONS]" 6 | echo "OPTIONS:" 7 | echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')" 8 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 9 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)" 10 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'gpu', and 'cpu', default: 'cpu')" 11 | echo " -lf, --log_file Logging file name." 12 | echo " -md, --models_dir Models directory." 13 | echo " -h, --help Show this help message" 14 | exit 1 15 | } 16 | 17 | 18 | download_models() { 19 | echo -e "\nDownloading models..." 20 | bash ./download.sh --models models.json --cache cache.log 21 | } 22 | 23 | check_jq() { 24 | if ! command -v jq &> /dev/null 25 | then 26 | echo -e "\njq is not installed." 27 | exit 1 28 | fi 29 | } 30 | 31 | # Parse command-line arguments 32 | while [ "$#" -gt 0 ]; do 33 | case "$1" in 34 | -p|--prompt) 35 | PROMPT="$2" 36 | shift 2 37 | ;; 38 | -r|--repetitions) 39 | REPETITIONS="$2" 40 | shift 2 41 | ;; 42 | -m|--max_tokens) 43 | MAX_TOKENS="$2" 44 | shift 2 45 | ;; 46 | -d|--device) 47 | DEVICE="$2" 48 | case "$DEVICE" in 49 | "cuda" | "metal" | "cpu") 50 | ;; 51 | *) 52 | echo "Invalid value for --device. 
Please use 'cuda', 'gpu' or 'cpu'." 53 | print_usage 54 | ;; 55 | esac 56 | shift 2 57 | ;; 58 | -lf|--log_file) 59 | LOG_FILENAME="$2" 60 | shift 2 61 | ;; 62 | -md|--models_dir) 63 | MODELS_DIR="$2" 64 | shift 2 65 | ;; 66 | -h|--help) 67 | print_usage 68 | ;; 69 | *) 70 | echo "Unknown option: $1" 71 | print_usage 72 | ;; 73 | esac 74 | done 75 | 76 | check_jq 77 | download_models 78 | 79 | 80 | PROMPT="${PROMPT:-"Explain what is a transformer"}" 81 | REPETITIONS="${REPETITIONS:-10}" 82 | MAX_TOKENS="${MAX_TOKENS:-100}" 83 | DEVICE="${DEVICE:-'cpu'}" 84 | LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" 85 | MODELS_DIR="${MODELS_DIR:-"./models"}" 86 | 87 | folders=$(find . -type d -name "bench_*") 88 | 89 | for folder in $folders; do 90 | if [ -d "$folder" ]; then 91 | echo "Running benchmark $folder/bench.sh..." 92 | 93 | if ! bash "$folder/bench.sh" \ 94 | --prompt "$PROMPT" \ 95 | --repetitions "$REPETITIONS" \ 96 | --max_tokens "$MAX_TOKENS" \ 97 | --models_dir "$MODELS_DIR" \ 98 | --log_file "$LOG_FILENAME" \ 99 | --device "$DEVICE"; then 100 | echo "Error: Something went wrong in $folder/bench.sh" 101 | else 102 | echo "Success: $folder/bench.sh completed successfully" 103 | fi 104 | fi 105 | done 106 | -------------------------------------------------------------------------------- /bench_autoawq/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 12 | CURRENT_DIR="$(pwd)" 13 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 14 | VENV_DIR="$SCRIPT_DIR/venv" 15 | 16 | # Set default folder paths for AWQ weights 17 | LLAMA2_AWQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/llama-2-7b-chat-autoawq" 18 | MISTRAL_AWQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-autoawq" 19 | 20 | check_python() { 21 | if command -v python &> /dev/null; then 22 | PYTHON_CMD="python" 23 | elif command -v python3 &> /dev/null; then 24 | PYTHON_CMD="python3" 25 | else 26 | echo "Python is not installed." 27 | exit 1 28 | fi 29 | } 30 | 31 | download_awq_weights() { 32 | local MODEL_NAME="$1" 33 | 34 | # Set download directory based on MODEL_NAME 35 | if [ "$MODEL_NAME" = "llama" ]; then 36 | DOWNLOAD_DIR="$LLAMA2_AWQ_WEIGHTS_FOLDER" 37 | MODEL_IDENTIFIER="TheBloke/Llama-2-7B-Chat-AWQ" 38 | elif [ "$MODEL_NAME" = "mistral" ]; then 39 | DOWNLOAD_DIR="$MISTRAL_AWQ_WEIGHTS_FOLDER" 40 | MODEL_IDENTIFIER="TheBloke/Mistral-7B-Instruct-v0.1-AWQ" 41 | else 42 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 43 | exit 1 44 | fi 45 | 46 | # Check if weights folder exists 47 | echo "$DOWNLOAD_DIR" 48 | 49 | if [ ! -d "$DOWNLOAD_DIR" ]; then 50 | # Download weights using huggingface-cli 51 | echo "Downloading weights to $DOWNLOAD_DIR..." 52 | huggingface-cli download "$MODEL_IDENTIFIER" --local-dir "$DOWNLOAD_DIR" --exclude "*.git*" "*.md" "Notice" "LICENSE" 53 | else 54 | echo "Weights already downloaded" 55 | fi 56 | } 57 | 58 | check_python 59 | 60 | if [ ! -d "$VENV_DIR" ]; then 61 | "$PYTHON_CMD" -m venv "$VENV_DIR" 62 | echo "Virtual environment '$VENV_DIR' created." 
63 | 64 | # Activate virtual environment using specified activation scripts 65 | if [ -f "$VENV_DIR/bin/activate" ]; then 66 | # shellcheck disable=SC1091 67 | source "$VENV_DIR/bin/activate" 68 | elif [ -f "$VENV_DIR/Scripts/activate" ]; then 69 | # shellcheck disable=SC1091 70 | source "$VENV_DIR/Scripts/activate" 71 | else 72 | echo "Error: Unable to find virtual environment activation script." 73 | exit 1 74 | fi 75 | 76 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 77 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 78 | else 79 | # Activate virtual environment using specified activation scripts 80 | if [ -f "$VENV_DIR/bin/activate" ]; then 81 | # shellcheck disable=SC1091 82 | source "$VENV_DIR/bin/activate" 83 | elif [ -f "$VENV_DIR/Scripts/activate" ]; then 84 | # shellcheck disable=SC1091 85 | source "$VENV_DIR/Scripts/activate" 86 | else 87 | echo "Error: Unable to find virtual environment activation script." 88 | exit 1 89 | fi 90 | fi 91 | 92 | 93 | MODEL_NAME="${1:-"llama"}" # Use the first argument as MODEL_NAME if provided 94 | download_awq_weights "$MODEL_NAME" 95 | -------------------------------------------------------------------------------- /bench_exllamav2/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | # Define directory paths 10 | CURRENT_DIR="$(pwd)" 11 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 12 | VENV_DIR="$SCRIPT_DIR/venv" 13 | 14 | # Make the default dirs 15 | LLAMA2_EXLLAMA_WEIGHTS_FOLDER="$CURRENT_DIR/models/llama-2-7b-chat-exllamav2" 16 | MISTRAL_EXLLAMA_WEIGHTS_FOLDER="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-exllamav2" 17 | 18 | check_python() { 19 | if command -v python &> /dev/null; then 20 | PYTHON_CMD="python" 21 | elif command -v python3 &> /dev/null; then 22 | PYTHON_CMD="python3" 23 | else 24 | echo "Python is not installed." 25 | exit 1 26 | fi 27 | } 28 | 29 | 30 | setup_exllamav2_and_quantize() { 31 | local MODEL_NAME="$1" 32 | local QUANTIZATION="$2" 33 | 34 | if [ "$MODEL_NAME" = "llama" ]; then 35 | EXLLAMA_WEIGHTS_FOLDER="$LLAMA2_EXLLAMA_WEIGHTS_FOLDER-$QUANTIZATION-bit" 36 | HF_WEIGHTS_FOLDER="$CURRENT_DIR/models/llama-2-7b-chat-hf" 37 | elif [ "$MODEL_NAME" = "mistral" ]; then 38 | EXLLAMA_WEIGHTS_FOLDER="$MISTRAL_EXLLAMA_WEIGHTS_FOLDER-$QUANTIZATION-bit" 39 | HF_WEIGHTS_FOLDER="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-hf" 40 | else 41 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 42 | exit 1 43 | fi 44 | 45 | # do the conversion if the ExLlamaV2 46 | if [ -d "$EXLLAMA_WEIGHTS_FOLDER" ] && [ "$(ls -A "$EXLLAMA_WEIGHTS_FOLDER")" ]; then 47 | echo "EXLLAMA_WEIGHTS_FOLDER already exists and is not empty." 48 | else 49 | # clone the repo, if not exists 50 | if [ -d "$SCRIPT_DIR/exllamav2" ]; then 51 | echo "exllamav2 folder already exists." 52 | else 53 | git clone https://github.com/turboderp/exllamav2.git "$SCRIPT_DIR/exllamav2" 54 | fi 55 | 56 | mkdir -p "$EXLLAMA_WEIGHTS_FOLDER" 57 | echo "Going for conversion to exllamav2 format from .safetensors in $QUANTIZATION bit quantization." 
58 | "$PYTHON_CMD" "$SCRIPT_DIR/exllamav2/convert.py" \ 59 | -i "$HF_WEIGHTS_FOLDER" \ 60 | -o "$EXLLAMA_WEIGHTS_FOLDER" \ 61 | -cf "$EXLLAMA_WEIGHTS_FOLDER" \ 62 | -b "$QUANTIZATION" 63 | 64 | # once done sync with other folders 65 | rm -rf "$EXLLAMA_WEIGHTS_FOLDER/out_tensor" 66 | rsync -av --exclude='*.safetensors' --exclude='.*' --exclude='*.bin' "$HF_WEIGHTS_FOLDER" "$EXLLAMA_WEIGHTS_FOLDER" 67 | fi 68 | 69 | # Delete ExllamaV2 repo 70 | rm -rf "$SCRIPT_DIR/exllamav2" 71 | } 72 | 73 | 74 | check_python 75 | 76 | # CLI Args 77 | MODEL_NAME="$1" 78 | 79 | if [ ! -d "$VENV_DIR" ]; then 80 | "$PYTHON_CMD" -m venv "$VENV_DIR" 81 | echo "Virtual environment '$VENV_DIR' created." 82 | # shellcheck disable=SC1091 83 | source "$VENV_DIR/bin/activate" 84 | pip install --upgrade pip > /dev/null 85 | pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 86 | else 87 | # shellcheck disable=SC1091 88 | source "$VENV_DIR/bin/activate" 89 | fi 90 | 91 | echo "Converting HuggingFace Llama2 model pytorch .bin file to .safetensors format" 92 | 93 | setup_exllamav2_and_quantize "$MODEL_NAME" 4.0 94 | setup_exllamav2_and_quantize "$MODEL_NAME" 8.0 95 | -------------------------------------------------------------------------------- /bench_deepspeed/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import mii 5 | from transformers import AutoTokenizer 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from common.base import BaseBenchmarkClass # noqa 10 | from common.utils import launch_cli, make_report # noqa 11 | 12 | 13 | class DeepSpeedBenchmark(BaseBenchmarkClass): 14 | def __init__( 15 | self, 16 | model_path: str, 17 | model_name: str, 18 | benchmark_name: str, 19 | precision: str, 20 | device: str, 21 | experiment_name: str, 22 | ) -> None: 23 | super().__init__( 24 | model_path=model_path, 25 | model_name=model_name, 26 | benchmark_name=benchmark_name, 27 | precision=precision, 28 | device=device, 29 | experiment_name=experiment_name, 30 | ) 31 | 32 | assert precision == "float16", ValueError( 33 | "Precision other than 'float16' is not supported in DeepSpeed" 34 | ) 35 | assert device == "cuda", ValueError( 36 | "Supported device is only cuda for DeepSpeed" 37 | ) 38 | 39 | def load_model_and_tokenizer(self): 40 | self.model = mii.pipeline(self.model_path) 41 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) 42 | return self 43 | 44 | def preprocess( 45 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 46 | ): 47 | if chat_mode: 48 | template = self.get_chat_template_with_instruction( 49 | prompt=prompt, for_benchmarks=for_benchmarks 50 | ) 51 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 52 | 53 | tokenized_input = self.tokenizer.encode(text=prompt) 54 | 55 | return { 56 | "prompt": prompt, 57 | "input_tokens": tokenized_input, 58 | "tensor": None, 59 | "num_input_tokens": len(tokenized_input), 60 | } 61 | 62 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 63 | prompt = inputs["prompt"] 64 | output = self.model( 65 | [prompt], max_new_tokens=max_tokens, temperature=temperature 66 | )[0].generated_text 67 | 68 | output_tokens = self.tokenizer.encode(text=output) 69 | return { 70 | "output_prompt": output, 71 | "output_tokens": output_tokens, 72 | "num_output_tokens": len(output_tokens), 73 | } 74 | 75 | def postprocess(self, output: dict) -> str: 76 | return output["output_prompt"] 77 | 78 | 79 | if __name__ 
== "__main__": 80 | parser = launch_cli(description="DeepSpeed Benchmark.") 81 | args = parser.parse_args() 82 | 83 | model_folder = os.path.join(os.getcwd(), "models") 84 | model_name = ( 85 | f"{args.model_name}-2-7b-chat-hf" 86 | if args.model_name == "llama" 87 | else f"{args.model_name}-7b-v0.1-instruct-hf" 88 | ) 89 | 90 | runner_dict = { 91 | "cuda": [ 92 | { 93 | "precision": "float16", 94 | "model_path": os.path.join(model_folder, model_name), 95 | } 96 | ] 97 | } 98 | 99 | make_report( 100 | args=args, 101 | benchmark_class=DeepSpeedBenchmark, 102 | runner_dict=runner_dict, 103 | benchmark_name="DeepSpeed", 104 | is_bench_pytorch=False, 105 | ) 106 | -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: download.sh 5 | # Description: Downloads files from a list of URLs specified in a JSON file. 6 | # The JSON file should contain an array of objects, each with a 'url', 'file', 7 | # and 'folder' property. The script checks if the file already exists before 8 | # downloading it. 9 | # 10 | # Usage: ./download.sh --models --cache --force-download 11 | # 12 | # Example: 13 | # ./download.sh --models models.json --cache cache.log --force-download 14 | ################################################################################ 15 | 16 | set -euo pipefail 17 | 18 | # Default values 19 | models_file="$(pwd)/models.json" 20 | cache_file="$(pwd)/cache.log" 21 | force_download=false 22 | 23 | # Function to download a file 24 | download_file() { 25 | local url=$1 26 | local file=$2 27 | local dir=$3 28 | 29 | # Create the directory if it does not exist 30 | mkdir -p "$dir" 31 | 32 | # Download the file 33 | wget -N "$url" -O "$dir/$file" 34 | echo "$url" >> "$cache_file" 35 | } 36 | 37 | # Function to unzip a file 38 | unzip_file() { 39 | local file=$1 40 | local dir=$2 41 | 42 | # Unzip the file 43 | unzip -o "$file" -d "$dir" 44 | 45 | # Move the unzipped files to the parent directory 46 | find "$dir" -mindepth 2 -type f -exec mv {} "$dir" \; 47 | } 48 | 49 | # Function to remove a file 50 | remove_file() { 51 | local file=$1 52 | 53 | # Remove the file 54 | rm "$file" 55 | } 56 | 57 | # Argument parsing 58 | while [[ $# -gt 0 ]]; do 59 | key="$1" 60 | 61 | case $key in 62 | --models) 63 | models_file="$2" 64 | shift # past argument 65 | shift # past value 66 | ;; 67 | --cache) 68 | cache_file="$2" 69 | shift # past argument 70 | shift # past value 71 | ;; 72 | --force-download) 73 | force_download=true 74 | shift # past argument 75 | ;; 76 | *) 77 | echo "Unknown option: $1" 78 | exit 1 79 | ;; 80 | esac 81 | done 82 | 83 | # Check if the required arguments are provided 84 | if [ -z "$models_file" ] || [ -z "$cache_file" ]; then 85 | echo "Usage: $0 --models --cache [--force-download]" 86 | exit 1 87 | fi 88 | 89 | # Check if the JSON file exists 90 | if [ ! -f "$models_file" ]; then 91 | echo "Error: JSON file '$models_file' does not exist." 92 | exit 1 93 | fi 94 | 95 | # Check if force download is enabled 96 | if $force_download; then 97 | echo "Force download enabled. Removing all files in the models folder and cache file." 
98 | rm -rf ./models/* 99 | rm "$cache_file" 100 | fi 101 | 102 | # Read the JSON file 103 | json=$(cat "$models_file") 104 | 105 | # Parse the JSON file and iterate over its elements 106 | echo "$json" | jq -r '.[] | @base64' | while read -r i; do 107 | _jq() { 108 | echo "${i}" | base64 --decode | jq -r "${1}" 109 | } 110 | 111 | url=$(_jq '.url') 112 | file=$(_jq '.file') 113 | folder=$(_jq '.folder') 114 | 115 | # Check if the URL is in the log file 116 | if ! grep -q "$url" "$cache_file"; then 117 | if [[ $file == *.zip ]]; then 118 | echo "Downloading and unzipping: $url to $folder" 119 | download_file "$url" "$file" "$folder" 120 | unzip_file "$folder/$file" "$folder" 121 | echo "Removing: $folder/$file" 122 | remove_file "$folder/$file" 123 | else 124 | echo "Downloading: $url to $folder/$file" 125 | download_file "$url" "$file" "$folder" 126 | fi 127 | fi 128 | done 129 | -------------------------------------------------------------------------------- /bench_ctranslate/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: This script automates the setup of a virtual environment, 6 | # installs project requirements, converts model. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | CURRENT_DIR="$(pwd)" 12 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 13 | VENV_DIR="$SCRIPT_DIR/venv" 14 | 15 | check_python() { 16 | if command -v python &> /dev/null; then 17 | PYTHON_CMD="python" 18 | elif command -v python3 &> /dev/null; then 19 | PYTHON_CMD="python3" 20 | else 21 | echo "Python is not installed." 22 | exit 1 23 | fi 24 | } 25 | 26 | 27 | build_and_compile_model () { 28 | local MODEL_NAME="$1" 29 | local PRECISION="$2" 30 | 31 | valid_precisions=("float32" "float16" "int8") 32 | 33 | # shellcheck disable=SC2199 34 | # shellcheck disable=SC2076 35 | if [[ ! " ${valid_precisions[@]} " =~ " $PRECISION " ]]; then 36 | echo "Invalid PRECISION value. Supported values are ${valid_precisions[*]}." 37 | exit 1 38 | fi 39 | 40 | if [[ "$MODEL_NAME" == "llama" ]]; then 41 | local model_download_path="$CURRENT_DIR/models/llama-2-7b-chat-ctranslate2-$PRECISION" 42 | local model_to_convert="$CURRENT_DIR/models/llama-2-7b-chat-hf" 43 | 44 | elif [[ "$MODEL_NAME" == "mistral" ]]; then 45 | local model_download_path="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-ctranslate2-$PRECISION" 46 | local model_to_convert="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-hf" 47 | else 48 | echo "No such model is supported" 49 | exit 1 50 | fi 51 | 52 | 53 | if [ ! -d "$model_download_path" ]; then 54 | ct2-transformers-converter --model "$model_to_convert" --quantization "$PRECISION" --output_dir "$model_download_path" --copy_files tokenizer.model tokenizer_config.json tokenizer.json special_tokens_map.json --force 55 | echo "Model Build for model: $MODEL_NAME and precision: $PRECISION ran successfully" 56 | else 57 | echo "Download folder already exists" 58 | fi 59 | 60 | } 61 | 62 | 63 | build_and_compile_models() { 64 | local MODEL_NAME="$1" 65 | local PRECISIONS=("float32" "float16" "int8") 66 | 67 | for PRECISION in "${PRECISIONS[@]}"; do 68 | build_and_compile_model "$MODEL_NAME" "$PRECISION" 69 | done 70 | } 71 | 72 | 73 | MODEL_NAME="${1:-"llama"}" 74 | 75 | check_python 76 | 77 | if [ ! 
-d "$VENV_DIR" ]; then 78 | "$PYTHON_CMD" -m venv "$VENV_DIR" 79 | echo "Virtual environment '$VENV_DIR' created." 80 | 81 | # Activate virtual environment using specified activation scripts 82 | if [ -f "$VENV_DIR/bin/activate" ]; then 83 | # shellcheck disable=SC1091 84 | source "$VENV_DIR/bin/activate" 85 | else 86 | echo "Error: Unable to find virtual environment activation script." 87 | exit 1 88 | fi 89 | 90 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 91 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 92 | else 93 | # Activate virtual environment using specified activation scripts 94 | if [ -f "$VENV_DIR/bin/activate" ]; then 95 | # shellcheck disable=SC1091 96 | source "$VENV_DIR/bin/activate" 97 | else 98 | echo "Error: Unable to find virtual environment activation script." 99 | exit 1 100 | fi 101 | fi 102 | 103 | 104 | build_and_compile_models "$MODEL_NAME" 105 | -------------------------------------------------------------------------------- /bench_autogptq/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Main script starts here. 12 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 13 | VENV_DIR="$SCRIPT_DIR/venv" 14 | CURRENT_DIR="$(pwd)" 15 | 16 | # Set default folder paths for GPTQ weights 17 | LLAMA2_GPTQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/llama-2-7b-chat-autogptq" 18 | MISTRAL_GPTQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-autogptq" 19 | 20 | 21 | check_python() { 22 | if command -v python &> /dev/null; then 23 | PYTHON_CMD="python" 24 | elif command -v python3 &> /dev/null; then 25 | PYTHON_CMD="python3" 26 | else 27 | echo "Python is not installed." 28 | exit 1 29 | fi 30 | } 31 | 32 | download_gptq_weights() { 33 | local MODEL_NAME="$1" 34 | 35 | # Set download directory based on MODEL_NAME 36 | if [ "$MODEL_NAME" = "llama" ]; then 37 | DOWNLOAD_DIR="$LLAMA2_GPTQ_WEIGHTS_FOLDER" 38 | MODEL_IDENTIFIER="TheBloke/Llama-2-7B-Chat-GPTQ" 39 | elif [ "$MODEL_NAME" = "mistral" ]; then 40 | DOWNLOAD_DIR="$MISTRAL_GPTQ_WEIGHTS_FOLDER" 41 | MODEL_IDENTIFIER="TheBloke/Mistral-7B-Instruct-v0.1-GPTQ" 42 | else 43 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 44 | exit 1 45 | fi 46 | 47 | # Check if weights folder exists 48 | echo "$DOWNLOAD_DIR" 49 | 50 | if [ ! -d "$DOWNLOAD_DIR" ]; then 51 | # Download weights using huggingface-cli 52 | echo "Downloading weights to $DOWNLOAD_DIR..." 53 | huggingface-cli download "$MODEL_IDENTIFIER" --local-dir "$DOWNLOAD_DIR" --exclude "*.git*" "*.md" "Notice" "LICENSE" 54 | else 55 | echo "Weights already downloaded" 56 | fi 57 | } 58 | 59 | install_autogptq() { 60 | if [ -d "$SCRIPT_DIR/AutoGPTQ" ]; then 61 | echo "Removing existing AutoGPTQ directory..." 62 | rm -rf "$SCRIPT_DIR"/AutoGPTQ 63 | fi 64 | 65 | git clone https://github.com/PanQiWei/AutoGPTQ.git "$SCRIPT_DIR"/AutoGPTQ 66 | cd "$SCRIPT_DIR"/AutoGPTQ 67 | 68 | # Now build 69 | 70 | "$PYTHON_CMD" setup.py install 71 | 72 | # come out of the dir 73 | cd "$SCRIPT_DIR" 74 | } 75 | 76 | check_python 77 | 78 | if [ ! 
-d "$VENV_DIR" ]; then 79 | "$PYTHON_CMD" -m venv "$VENV_DIR" 80 | echo "Virtual environment '$VENV_DIR' created." 81 | 82 | if [ -f "$VENV_DIR/bin/activate" ]; then 83 | # shellcheck disable=SC1091 84 | source "$VENV_DIR/bin/activate" 85 | else 86 | echo "Error: Unable to find virtual environment activation script." 87 | exit 1 88 | fi 89 | 90 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 91 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 92 | 93 | "$PYTHON_CMD" -m pip uninstall -y fsspec 94 | 95 | # Install the required version of fsspec 96 | "$PYTHON_CMD" -m pip install 'fsspec[http]>=2023.1.0,<=2024.2.0' 97 | 98 | install_autogptq 99 | else 100 | if [ -f "$VENV_DIR/bin/activate" ]; then 101 | # shellcheck disable=SC1091 102 | source "$VENV_DIR/bin/activate" 103 | else 104 | echo "Error: Unable to find virtual environment activation script." 105 | exit 1 106 | fi 107 | fi 108 | 109 | 110 | MODEL_NAME="${1:-"llama"}" # Use the first argument as MODEL_NAME if provided 111 | download_gptq_weights "$MODEL_NAME" 112 | -------------------------------------------------------------------------------- /bench_lightning/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from pathlib import Path 4 | 5 | from transformers import AutoTokenizer 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from bench_lightning.inference import generate, load_model # noqa 10 | from common.base import BaseBenchmarkClass # noqa 11 | from common.utils import launch_cli, make_report # noqa 12 | 13 | 14 | class PyTorchLightningBenchmark(BaseBenchmarkClass): 15 | def __init__( 16 | self, 17 | model_path: str, 18 | model_name: str, 19 | benchmark_name: str, 20 | precision: str, 21 | device: str, 22 | experiment_name: str, 23 | ) -> None: 24 | super().__init__( 25 | model_name=model_name, 26 | model_path=model_path, 27 | benchmark_name=benchmark_name, 28 | experiment_name=experiment_name, 29 | precision=precision, 30 | device=device, 31 | ) 32 | 33 | self.quantization_precision_mapping = { 34 | "float16": {"precision": "16-true", "quantize": None}, 35 | "float32": {"precision": "32-true", "quantize": None}, 36 | "int8": {"precision": "16-true", "quantize": "bnb.int8"}, 37 | } 38 | 39 | if model_name == "llama": 40 | self.tokenizer_folder = os.path.join( 41 | os.getcwd(), "models", "llama-2-7b-chat-hf" 42 | ) 43 | else: 44 | self.tokenizer_folder = os.path.join( 45 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 46 | ) 47 | 48 | def load_model_and_tokenizer(self): 49 | self.model, self.lit_tokenizer, self.prompt_style, self.fabric = load_model( 50 | checkpoint_dir=self.model_path, 51 | quantize=self.quantization_precision_mapping[self.precision]["quantize"], 52 | precision=self.quantization_precision_mapping[self.precision]["precision"], 53 | ) 54 | 55 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 56 | return self 57 | 58 | def preprocess( 59 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 60 | ): 61 | return {"prompt": prompt} 62 | 63 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 64 | prompt = inputs["prompt"] 65 | output = generate( 66 | model=self.model, 67 | tokenizer=self.lit_tokenizer, 68 | prompt_style=self.prompt_style, 69 | fabric=self.fabric, 70 | prompt=prompt, 71 | max_new_tokens=max_tokens, 72 | temperature=temperature, 73 | ) 74 | 75 | output_prompt = self.tokenizer.decode( 76 | 
output["output_tokens"], skip_special_tokens=True 77 | ) 78 | return {**output, "output_prompt": output_prompt} 79 | 80 | def postprocess(self, output: dict) -> str: 81 | return output["output_prompt"] 82 | 83 | 84 | if __name__ == "__main__": 85 | parser = launch_cli(description="PyTorch Lightning") 86 | args = parser.parse_args() 87 | 88 | model_folder = os.path.join(os.getcwd(), "models") 89 | model_name = ( 90 | f"{args.model_name}-2-7b-chat-litgpt" 91 | if args.model_name == "llama" 92 | else f"{args.model_name}-7b-v0.1-instruct-litgpt" 93 | ) 94 | 95 | model_path = Path(os.path.join(model_folder, model_name)) 96 | 97 | runner_dict = { 98 | "cuda": [ 99 | {"precision": "float16", "model_path": model_path}, 100 | {"precision": "float32", "model_path": model_path}, 101 | {"precision": "int8", "model_path": model_path}, 102 | ] 103 | } 104 | 105 | make_report( 106 | args=args, 107 | benchmark_class=PyTorchLightningBenchmark, 108 | runner_dict=runner_dict, 109 | benchmark_name="PyTorch Lightning", 110 | is_bench_pytorch=False, 111 | ) 112 | -------------------------------------------------------------------------------- /questions.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "prompt": "I'm making pancakes for breakfast. I added a cup of flour, a teaspoon of salt, and a few tablespoons of sugar to a bowl. I stirred it together, then added a cup of milk, a beaten egg, and a few tablespoons of oil, and stirred until just mixed. Then I put 1/4 a cup on a hot frying pan, and flipped it when brown. But they're terrible! Why? List the main reason. Answer as much precise as possible with one sentence.", 4 | "max_tokens": 120, 5 | "temperature": 0.1, 6 | "expected": { 7 | "llama": " The main reason your pancakes are terrible is because you did not mix the batter long enough, resulting in a dense and flat pancake.", 8 | "mistral": "he main reason for the terrible pancakes could be that the batter was too thick, which resulted in a dry and tough texture." 9 | } 10 | }, 11 | { 12 | "prompt": "42 birds are sitting on a tree branch. A hunter passes, shoots one dead, and misses two. How many birds are left on the branch? Answer as much precise as possible with one sentence.", 13 | "max_tokens": 120, 14 | "temperature": 0.1, 15 | "expected": { 16 | "llama": " After the hunter shoots and kills one bird, 41 birds remain on the branch.", 17 | "mistral": "One bird is left on the branch." 18 | } 19 | }, 20 | { 21 | "prompt": "How many mistakes can you find in the following sentence? Sentence: Its to much to loose if your talking about hundred’s of dollars. Answer as much precise as possible with one sentence.", 22 | "max_tokens": 120, 23 | "temperature": 0.1, 24 | "expected": { 25 | "llama": " There are 2 mistakes in the sentence: \"to loose\" should be \"to lose\" and \"hundred's\" should be \"hundreds\".", 26 | "mistral": "There are two mistakes in the sentence: \"Its\" should be \"It's\" and \"to much\" should be \"too much\"." 27 | } 28 | }, 29 | { 30 | "prompt": "You are an expert on agricultural chemical compounds. Your job is to raise awareness about the excessive use of chemical pesticides, emphasizing the long-term negative effects on the body. Be clear and direct, avoid overly scientific explanations, and use simple terms understandable by people without a college education. 
Answer as much precise as possible with one sentence.", 31 | "max_tokens": 120, 32 | "temperature": 0.1, 33 | "expected": { 34 | "llama": " Excessive use of chemical pesticides can have long-term negative effects on the body, including cancer, hormone disruption, and neurological problems, due to their toxic and persistent nature, which can accumulate in the body over time and cause irreversible damage. It is important to adopt sustainable and organic farming practices that prioritize the health of both people and the environment, such as crop rotation, composting, and integrated pest management, to minimize the use of harmful chemicals and promote a healthier", 35 | "mistral": "Excessive use of chemical pesticides can lead to long-term negative health effects, including cancer, neurological damage, and reproductive issues." 36 | } 37 | }, 38 | { 39 | "prompt": "During the Manhattan Project, John von Neumann suggested how to maximize the destruction caused by a detonating bomb. Which was his idea? Answer as much precise as possible with one sentence.", 40 | "max_tokens": 120, 41 | "temperature": 0.1, 42 | "expected": { 43 | "llama": " John von Neumann proposed the concept of a \"fusion bomb,\" which would achieve maximum destruction through the simultaneous detonation of a fission bomb and a fusion reaction, creating a chain reaction of nuclear reactions that would release an enormous amount of energy. This idea was later developed into the hydrogen bomb.", 44 | "mistral": "John von Neumann proposed the concept of a \"superbomb\" that would use multiple nuclear explosions to create a massive explosion, causing widespread destruction." 45 | } 46 | } 47 | ] 48 | -------------------------------------------------------------------------------- /bench_llamacpp/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from llama_cpp import Llama 5 | from transformers import AutoTokenizer 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from common.base import BaseBenchmarkClass # noqa 10 | from common.utils import launch_cli, make_report # noqa 11 | 12 | 13 | class LlamaCPPBenchmark(BaseBenchmarkClass): 14 | def __init__( 15 | self, 16 | model_path: str, 17 | model_name: str, 18 | benchmark_name: str, 19 | precision: str, 20 | device: str, 21 | experiment_name: str, 22 | ) -> None: 23 | assert precision in ["int8", "int4"], ValueError( 24 | "Precision should set either 'int8' or 'int4'" 25 | ) 26 | super().__init__( 27 | model_name=model_name, 28 | model_path=model_path, 29 | benchmark_name=benchmark_name, 30 | experiment_name=experiment_name, 31 | precision=precision, 32 | device=device, 33 | ) 34 | 35 | if model_name == "llama": 36 | self.tokenizer_folder = os.path.join( 37 | os.getcwd(), "models", "llama-2-7b-chat-hf" 38 | ) 39 | else: 40 | self.tokenizer_folder = os.path.join( 41 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 42 | ) 43 | 44 | def load_model_and_tokenizer(self): 45 | self.model = Llama( 46 | model_path=self.model_path, 47 | n_gpu_layers=0 if self.device == "cpu" else -1, 48 | verbose=True, 49 | ) 50 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 51 | return self 52 | 53 | def preprocess( 54 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 55 | ): 56 | if chat_mode: 57 | template = self.get_chat_template_with_instruction( 58 | prompt=prompt, for_benchmarks=for_benchmarks 59 | ) 60 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 61 | 
62 | tokenized_input = self.tokenizer.encode(text=prompt) 63 | return { 64 | "prompt": prompt, 65 | "input_tokens": tokenized_input, 66 | "tensor": None, 67 | "num_input_tokens": len(tokenized_input), 68 | } 69 | 70 | def run_model( 71 | self, inputs: dict, max_tokens: int, temperature: float = 0.1 72 | ) -> dict: 73 | prompt = inputs["prompt"] 74 | output = self.model.create_completion( 75 | prompt, max_tokens=max_tokens, temperature=temperature 76 | ) 77 | 78 | output_prompt = output["choices"][0]["text"] 79 | num_tokens = output["usage"]["completion_tokens"] 80 | return {"output_prompt": output_prompt, "num_output_tokens": num_tokens} 81 | 82 | def postprocess(self, output: dict) -> str: 83 | return output["output_prompt"] 84 | 85 | 86 | if __name__ == "__main__": 87 | parser = launch_cli(description="LlamaCPP Benchmark.") 88 | args = parser.parse_args() 89 | 90 | model_folder = os.path.join(os.getcwd(), "models") 91 | model_name = ( 92 | f"{args.model_name}-2-7b-chat-gguf/llama-2-7b-chat." 93 | if args.model_name == "llama" 94 | else f"{args.model_name}-7b-v0.1-instruct-gguf/mistral-7b-instruct-v0.1." 95 | ) 96 | 97 | runner_dict = { 98 | "cuda": [ 99 | { 100 | "precision": "int4", 101 | "model_path": os.path.join(model_folder, model_name + "Q4_K_M.gguf"), 102 | }, 103 | { 104 | "precision": "int8", 105 | "model_path": os.path.join(model_folder, model_name + "Q8_0.gguf"), 106 | }, 107 | ] 108 | } 109 | 110 | make_report( 111 | args=args, 112 | benchmark_class=LlamaCPPBenchmark, 113 | runner_dict=runner_dict, 114 | benchmark_name="LlamaCPP", 115 | is_bench_pytorch=False, 116 | ) 117 | -------------------------------------------------------------------------------- /docs/archive.md: -------------------------------------------------------------------------------- 1 | # ⚙️ Benchmarking ML Engines 2 | 3 | This file contains archived benchmark numbers for different engines and precisions. Many of the models and engines have been upgraded since these runs were recorded, so the 4 | results below are kept for reference only. The latest implementation does not include benchmarks for Metal or Mac CPU, so if you need those numbers, you can find them here.
5 | 6 | ## A100 80GB Inference Bench: 7 | 8 | **Environment:** 9 | - Model: LLAMA-2-7B 10 | - CUDA Version: 11.7 11 | - Command: `./benchmark.sh --repetitions 10 --max_tokens 512 --device cuda --prompt 'Write an essay about the transformer model architecture'` 12 | 13 | **Performance Metrics:** (unit: Tokens / second) 14 | 15 | | Engine | float32 | float16 | int8 | int4 | 16 | | ------------------------------------------ | ------------- | ------------- | ------------- | -------------- | 17 | | [candle](/bench_candle/) | - | 36.78 ± 2.17 | - | - | 18 | | [llama.cpp](/bench_llamacpp/) | - | - | 79.15 ± 1.20 | 100.90 ± 1.46 | 19 | | [ctranslate](/bench_ctranslate/) | 35.23 ± 4.01 | 55.72 ± 16.66 | 35.73 ± 10.87 | - | 20 | | [onnx](/bench_onnxruntime/) | - | 54.16 ± 3.15 | - | - | 21 | | [transformers (pytorch)](/bench_pytorch/) | 43.79 ± 0.61 | 46.39 ± 0.28 | 6.98 ± 0.05 | 21.72 ± 0.11 | 22 | | [vllm](/bench_vllm/) | 90.78 ± 1.60 | 90.54 ± 2.22 | - | 114.69 ± 11.20 | 23 | | [exllamav2](/bench_exllamav2/) | - | - | 121.63 ± 0.74 | 130.16 ± 0.35 | 24 | | [ctransformers](/bench_ctransformers/) | - | - | 76.75 ± 10.36 | 84.26 ± 5.79 | 25 | | [AutoGPTQ](/bench_autogptq/) | 42.01 ± 1.03 | 30.24 ± 0.41 | - | - | 26 | | [AutoAWQ](/bench_autoawq/) | - | - | - | 109.20 ± 3.28 | 27 | | [DeepSpeed](/bench_deepspeed/) | - | 81.44 ± 8.13 | - | | 28 | | [PyTorch Lightning](/bench_lightning/) | 24.85 ± 0.07 | 44.56 ± 2.89 | 10.50 ± 0.12 | 24.83 ± 0.05 | 29 | | [Optimum Nvidia](/bench_optimum_nvidia/) | 110.36 ± 0.52 | 109.09 ± 4.26 | - | - | 30 | | [Nvidia TensorRT-LLM](/bench_tensorrtllm/) | 55.19 ± 1.03 | 85.03 ± 0.62 | 167.66 ± 2.05 | 235.18 ± 3.20 | 31 | 32 | *(Data updated: `05th April 2024`) 33 | 34 | 35 | ## M2 MAX 32GB Inference Bench: 36 | 37 | ### CPU 38 | 39 | **Environment:** 40 | - Model: LLAMA-2-7B 41 | - CUDA Version: NA 42 | - Command: `./benchmark.sh --repetitions 10 --max_tokens 512 --device cpu --prompt 'Write an essay about the transformer model architecture'` 43 | 44 | **Performance Metrics:** (unit: Tokens / second) 45 | | Engine | float32 | float16 | int8 | int4 | 46 | | -------------------------------------- | ------- | ----------- | ------------ | ------------ | 47 | | [candle](/bench_candle/) | - | 3.43 ± 0.02 | - | - | 48 | | [llama.cpp](/bench_llamacpp/) | - | - | 13.24 ± 0.62 | 21.43 ± 0.47 | 49 | | [ctranslate](/bench_ctranslate/) | - | - | 1.87 ± 0.14 | - | 50 | | [ctransformers](/bench_ctransformers/) | - | - | 13.50 ± 0.48 | 20.57 ± 2.50 | 51 | 52 | 53 | ### GPU (Metal) 54 | 55 | **Command:** `./benchmark.sh --repetitions 10 --max_tokens 512 --device metal --prompt 'Write an essay about the transformer model architecture'` 56 | 57 | **Performance Metrics:** (unit: Tokens / second) 58 | | Engine | float32 | float16 | int8 | int4 | 59 | | -------------------------------------- | ------- | ------- | ------------ | ------------ | 60 | | [llama.cpp](/bench_llamacpp/) | - | - | 30.11 ± 0.45 | 44.27 ± 0.12 | 61 | | [ctransformers](/bench_ctransformers/) | - | - | 20.75 ± 0.36 | 34.04 ± 2.11 | 62 | 63 | *(Data updated: `05th April 2024`) 64 | -------------------------------------------------------------------------------- /bench_ctranslate/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import ctranslate2 5 | from transformers import AutoTokenizer 6 | 7 | # have to hard code this thing 8 | sys.path.append(os.getcwd()) 9 | 10 | from common.base import BaseBenchmarkClass # noqa 11 | from common.utils import 
launch_cli, make_report # noqa 12 | 13 | 14 | class CTranslateBenchmark(BaseBenchmarkClass): 15 | def __init__( 16 | self, 17 | model_path: str, 18 | model_name: str, 19 | benchmark_name: str, 20 | precision: str, 21 | device: str, 22 | experiment_name: str, 23 | ) -> None: 24 | assert precision in ["float32", "float16", "int8"], ValueError( 25 | "Precisions other than 'float32', 'float16' and 'int8' are not supported" 26 | ) 27 | super().__init__( 28 | model_path=model_path, 29 | model_name=model_name, 30 | benchmark_name=benchmark_name, 31 | precision=precision, 32 | device=device, 33 | experiment_name=experiment_name, 34 | ) 35 | 36 | def load_model_and_tokenizer(self): 37 | self.model = ctranslate2.Generator(self.model_path, device=self.device) 38 | 39 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) 40 | return self 41 | 42 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 43 | if chat_mode: 44 | template = self.get_chat_template_with_instruction( 45 | prompt=prompt, for_benchmarks=for_benchmarks 46 | ) 47 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 48 | 49 | tokenized_input = self.tokenizer.convert_ids_to_tokens( 50 | self.tokenizer.encode(prompt) 51 | ) 52 | return { 53 | "prompt": prompt, 54 | "input_tokens": tokenized_input, 55 | "tensor": None, 56 | "num_input_tokens": len(tokenized_input), 57 | } 58 | 59 | def run_model( 60 | self, inputs: dict, max_tokens: int, temperature: float = 0.1 61 | ) -> dict: 62 | tokenized_input = inputs["input_tokens"] 63 | num_input_tokens = inputs["num_input_tokens"] - 1 64 | 65 | output = self.model.generate_batch( 66 | [tokenized_input], max_length=max_tokens, sampling_temperature=temperature 67 | ) 68 | 69 | output_tokens = output[0].sequences_ids[0][num_input_tokens:] 70 | output_prompt = self.tokenizer.decode(output_tokens, skip_special_tokens=True) 71 | return { 72 | "output_prompt": output_prompt, 73 | "output_tokens": output_tokens, 74 | "num_output_tokens": len(output_tokens), 75 | } 76 | 77 | def postprocess(self, output: dict) -> str: 78 | return output["output_prompt"] 79 | 80 | 81 | if __name__ == "__main__": 82 | parser = launch_cli(description="CTranslate2 Benchmark.") 83 | args = parser.parse_args() 84 | 85 | model_folder = os.path.join(os.getcwd(), "models") 86 | model_name = ( 87 | f"{args.model_name}-2-7b-chat-ctranslate2-" 88 | if args.model_name == "llama" 89 | else f"{args.model_name}-7b-v0.1-instruct-ctranslate2-" 90 | ) 91 | 92 | runner_dict = { 93 | "cuda": [ 94 | { 95 | "precision": "float32", 96 | "model_path": os.path.join(model_folder, model_name + "float32"), 97 | }, 98 | { 99 | "precision": "float16", 100 | "model_path": os.path.join(model_folder, model_name + "float16"), 101 | }, 102 | { 103 | "precision": "int8", 104 | "model_path": os.path.join(model_folder, model_name + "int8"), 105 | }, 106 | ] 107 | } 108 | 109 | make_report( 110 | args=args, 111 | benchmark_class=CTranslateBenchmark, 112 | runner_dict=runner_dict, 113 | benchmark_name="CTranslate2", 114 | is_bench_pytorch=False, 115 | ) 116 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/
20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # don't check-in sub folder 163 | models/* 164 | !models/.gitkeep 165 | 166 | # Repositories 167 | bench_tinygrad/tinygrad 168 | bench_burn/llama2-burn 169 | bench_exllamav2/exllamav2 170 | bench_exllamav2/wikitext-test.parquet 171 | bench_lightning/lit-gpt 172 | -------------------------------------------------------------------------------- /bench_autoawq/bench.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | import torch 6 | from awq import AutoAWQForCausalLM 7 | from transformers import AutoTokenizer 8 | 9 | sys.path.append(os.getcwd()) 10 | 11 | from common.base import BaseBenchmarkClass # noqa 12 | from common.utils import launch_cli, make_report # noqa 13 | 14 | 15 | class AutoAWQBenchmark(BaseBenchmarkClass): 16 | def __init__( 17 | self, 18 | model_path: str, 19 | model_name: str, 20 | benchmark_name: str, 21 | precision: str, 22 | device: str, 23 | experiment_name: str, 24 | ) -> None: 25 | super().__init__( 26 | model_name=model_name, 27 | model_path=model_path, 28 | benchmark_name=benchmark_name, 29 | experiment_name=experiment_name, 30 | precision=precision, 31 | device=device, 32 | ) 33 | 34 | # Have to do this step 35 | # since tokenizer in autoawq is not the instruction tuned one for the instruction tuned model 36 | 37 | if model_name == "llama": 38 | self.tokenizer_folder = os.path.join( 39 | os.getcwd(), "models", "llama-2-7b-chat-hf" 40 | ) 41 | else: 42 | self.tokenizer_folder = os.path.join( 43 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 44 | ) 45 | 46 | def load_model_and_tokenizer(self): 47 | self.model = AutoAWQForCausalLM.from_quantized( 48 | self.model_path, fuse_layers=True, safetensors=True, strict=False 49 | ) 50 | 51 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 52 | return self 53 | 54 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 55 | if chat_mode: 56 | template = self.get_chat_template_with_instruction( 57 | prompt=prompt, for_benchmarks=for_benchmarks 58 | ) 59 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 60 | 61 | tokenized_input = self.tokenizer.encode(text=prompt) 62 | tensor = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device) 63 | return { 64 | "prompt": prompt, 65 | "input_tokens": tokenized_input, 66 | "tensor": tensor, 67 | "num_input_tokens": len(tokenized_input), 68 | } 69 | 70 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 71 | tensor = inputs["tensor"] 72 | num_input_tokens = inputs["num_input_tokens"] 73 | 74 | output = ( 75 | self.model.generate( 76 | input_ids=tensor, 77 | max_new_tokens=max_tokens, 78 | temperature=temperature, 79 | do_sample=True, 80 | ) 81 | .detach() 82 | .tolist()[0] 83 | ) 84 | 85 | output_tokens = ( 86 | output[num_input_tokens:] if len(output) > num_input_tokens else output 87 | ) 88 | 89 | return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)} 90 | 91 | def postprocess(self, output: dict) -> str: 92 | output_tokens = output["output_tokens"] 93 | return self.tokenizer.decode(output_tokens, skip_special_tokens=True) 94 | 95 | def on_exit(self): 96 | if self.device == "cuda:0": 97 | del self.model 98 | torch.cuda.synchronize() 99 | else: 100 | del self.model 101 | 102 | 103 | if __name__ == "__main__": 104 | 
parser = launch_cli(description="AWQ Benchmark.") 105 | args = parser.parse_args() 106 | 107 | model_folder = os.path.join(os.getcwd(), "models") 108 | model_name = ( 109 | f"{args.model_name}-2-7b-chat-autoawq" 110 | if args.model_name == "llama" 111 | else f"{args.model_name}-7b-v0.1-instruct-autoawq" 112 | ) 113 | 114 | runner_dict = { 115 | "cuda": [ 116 | {"precision": "int4", "model_path": os.path.join(model_folder, model_name)} 117 | ] 118 | } 119 | 120 | if args.device == "cpu": 121 | logging.info("Skipping running model on int4 on CPU, not implemented for Half") 122 | pass 123 | else: 124 | make_report( 125 | args=args, 126 | benchmark_class=AutoAWQBenchmark, 127 | runner_dict=runner_dict, 128 | benchmark_name="AutoAWQ", 129 | is_bench_pytorch=False, 130 | ) 131 | -------------------------------------------------------------------------------- /bench_candle/convert_to_safetensors.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import shutil 5 | from collections import defaultdict 6 | 7 | import torch 8 | from safetensors.torch import load_file, save_file 9 | 10 | 11 | def setup_logging(): 12 | logging.basicConfig( 13 | level=logging.INFO, 14 | format="%(asctime)s [%(levelname)s]: %(message)s", 15 | handlers=[logging.StreamHandler()], 16 | ) 17 | 18 | 19 | def shared_pointers(tensors): 20 | ptrs = defaultdict(list) 21 | for k, v in tensors.items(): 22 | ptrs[v.data_ptr()].append(k) 23 | failing = [] 24 | for _, names in ptrs.items(): 25 | if len(names) > 1: 26 | failing.append(names) 27 | return failing 28 | 29 | 30 | def check_file_size(sf_filename: str, pt_filename: str): 31 | sf_size = os.stat(sf_filename).st_size 32 | pt_size = os.stat(pt_filename).st_size 33 | 34 | if (sf_size - pt_size) / pt_size > 0.01: 35 | raise RuntimeError( 36 | f"The file size different is more than 1%:\n - {sf_filename}: {sf_size}\n - {pt_filename}: {pt_size}" 37 | ) 38 | 39 | 40 | def rename(pt_filename: str) -> str: 41 | filename, _ = os.path.splitext(pt_filename) 42 | local = f"{filename}.safetensors" 43 | local = local.replace("pytorch_model", "model") 44 | return local 45 | 46 | 47 | def copy_file(src: str, dest: str): 48 | try: 49 | shutil.copy(src, dest) 50 | logging.info(f"Copying {src} to {dest}") 51 | except FileNotFoundError: 52 | logging.warning(f"{src} not found. 
Skipping copy.") 53 | 54 | 55 | def convert_file(pt_filename: str, sf_filename: str): 56 | loaded = torch.load(pt_filename, map_location="cpu") 57 | if "state_dict" in loaded: 58 | loaded = loaded["state_dict"] 59 | shared = shared_pointers(loaded) 60 | for shared_weights in shared: 61 | for name in shared_weights[1:]: 62 | loaded.pop(name) 63 | 64 | # For tensors to be contiguous 65 | loaded = {k: v.contiguous() for k, v in loaded.items()} 66 | 67 | # Adjust sf_filename to ensure correct formatting 68 | sf_filename = os.path.join( 69 | os.path.dirname(sf_filename), os.path.basename(rename(pt_filename)) 70 | ) 71 | 72 | save_file(loaded, sf_filename, metadata={"format": "pt"}) 73 | check_file_size(sf_filename, pt_filename) 74 | reloaded = load_file(sf_filename) 75 | for k in loaded: 76 | pt_tensor = loaded[k] 77 | sf_tensor = reloaded[k] 78 | if not torch.equal(pt_tensor, sf_tensor): 79 | raise RuntimeError(f"The output tensors do not match for key {k}") 80 | 81 | 82 | def convert_multi(input_dir: str, output_dir: str) -> list[str]: 83 | if os.path.exists(output_dir): 84 | logging.warning(f"{output_dir} already exists!") 85 | return [] 86 | else: 87 | os.mkdir(output_dir) 88 | 89 | config_src = os.path.join(input_dir, "config.json") 90 | tokenizer_src = os.path.join(input_dir, "tokenizer.json") 91 | 92 | if not os.path.exists(config_src) or not os.path.exists(tokenizer_src): 93 | logging.warning(f"{config_src} or {tokenizer_src} not found. Skipping copy.") 94 | return [] 95 | else: 96 | copy_file(config_src, output_dir) 97 | copy_file(tokenizer_src, output_dir) 98 | 99 | filenames = [file for file in os.listdir(input_dir) if file.endswith(".bin")] 100 | 101 | local_filenames = [] 102 | for filename in filenames: 103 | pt_filename = os.path.join(input_dir, filename) 104 | 105 | sf_filename = rename(pt_filename) 106 | sf_filename = os.path.join(output_dir, os.path.basename(sf_filename)) 107 | 108 | logging.info(f"Converting {pt_filename} to {sf_filename}") 109 | convert_file(pt_filename, sf_filename) 110 | local_filenames.append(sf_filename) 111 | 112 | return local_filenames 113 | 114 | 115 | if __name__ == "__main__": 116 | setup_logging() 117 | 118 | parser = argparse.ArgumentParser(description="Convert .bin files to .safetensors") 119 | parser.add_argument( 120 | "--input_dir", 121 | type=str, 122 | help="Path to the input directory containing .bin files", 123 | ) 124 | parser.add_argument( 125 | "--output_dir", 126 | type=str, 127 | help="Path to the output directory for .safetensors files", 128 | ) 129 | args = parser.parse_args() 130 | 131 | output_filenames = convert_multi(args.input_dir, args.output_dir) 132 | 133 | logging.info("Conversion successful. 
Output files:") 134 | for filename in output_filenames: 135 | logging.info(filename) 136 | -------------------------------------------------------------------------------- /bench_ctransformers/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from ctransformers import AutoModelForCausalLM 6 | from transformers import AutoTokenizer 7 | 8 | sys.path.append(os.getcwd()) 9 | 10 | from common.base import BaseBenchmarkClass # noqa 11 | from common.utils import launch_cli, make_report # noqa 12 | 13 | 14 | class CTransformersBenchmark(BaseBenchmarkClass): 15 | def __init__( 16 | self, 17 | model_path: str, 18 | model_name: str, 19 | benchmark_name: str, 20 | precision: str, 21 | device: str, 22 | experiment_name: str, 23 | ) -> None: 24 | super().__init__( 25 | model_path=model_path, 26 | model_name=model_name, 27 | benchmark_name=benchmark_name, 28 | precision=precision, 29 | device=device, 30 | experiment_name=experiment_name, 31 | ) 32 | 33 | if model_name == "llama": 34 | self.tokenizer_folder = os.path.join( 35 | os.getcwd(), "models", "llama-2-7b-chat-hf" 36 | ) 37 | else: 38 | self.tokenizer_folder = os.path.join( 39 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 40 | ) 41 | 42 | def load_model_and_tokenizer(self): 43 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 44 | 45 | model_file_mapping = { 46 | "llama": { 47 | "int4": "llama-2-7b-chat.Q4_K_M.gguf", 48 | "int8": "llama-2-7b-chat.Q8_0.gguf", 49 | }, 50 | "mistral": { 51 | "int4": "mistral-7b-instruct-v0.1.Q4_K_M.gguf", 52 | "int8": "mistral-7b-instruct-v0.1.Q8_0.gguf", 53 | }, 54 | } 55 | 56 | self.model = AutoModelForCausalLM.from_pretrained( 57 | self.model_path, 58 | model_file=model_file_mapping[self.model_name][self.precision], 59 | model_type=self.model_name, 60 | gpu_layers=50 if self.device in ["cuda", "metal"] else 0, 61 | # context_length=1024 (This exceeds the memory without changing the quality) 62 | ) 63 | return self 64 | 65 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 66 | if chat_mode: 67 | template = self.get_chat_template_with_instruction( 68 | prompt=prompt, for_benchmarks=for_benchmarks 69 | ) 70 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 71 | 72 | tokenized_input = self.tokenizer.encode(text=prompt) 73 | return { 74 | "prompt": prompt, 75 | "input_tokens": tokenized_input, 76 | "tensor": None, 77 | "num_input_tokens": len(tokenized_input), 78 | } 79 | 80 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 81 | prompt = inputs["prompt"] 82 | output = self.model( 83 | prompt, stream=False, max_new_tokens=max_tokens, temperature=temperature 84 | ) 85 | generated_tokens = self.tokenizer.encode(output) 86 | 87 | # Note: CTransformers produces tokens after the input tokens 88 | return { 89 | "output_prompt": output, 90 | "output_tokens": generated_tokens, 91 | "num_output_tokens": len(generated_tokens), 92 | } 93 | 94 | def postprocess(self, output: dict) -> str: 95 | output_tokens = output["output_tokens"] 96 | return self.tokenizer.decode(output_tokens, skip_special_tokens=True) 97 | 98 | def on_exit(self): 99 | if self.device in ["cuda:0", "cuda"]: 100 | del self.model 101 | torch.cuda.synchronize() 102 | else: 103 | del self.model 104 | 105 | 106 | if __name__ == "__main__": 107 | parser = launch_cli(description="CTransformers Benchmark.") 108 | args = parser.parse_args() 109 | 110 | model_folder 
= os.path.join(os.getcwd(), "models") 111 | model_name = ( 112 | f"{args.model_name}-2-7b-chat-gguf" 113 | if args.model_name == "llama" 114 | else f"{args.model_name}-7b-v0.1-instruct-gguf" 115 | ) 116 | 117 | runner_dict = { 118 | "cuda": [ 119 | {"precision": "int4", "model_path": os.path.join(model_folder, model_name)}, 120 | {"precision": "int8", "model_path": os.path.join(model_folder, model_name)}, 121 | ] 122 | } 123 | 124 | make_report( 125 | args=args, 126 | benchmark_class=CTransformersBenchmark, 127 | runner_dict=runner_dict, 128 | benchmark_name="CTransformers", 129 | is_bench_pytorch=False, 130 | ) 131 | -------------------------------------------------------------------------------- /bench_optimum_nvidia/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from optimum.nvidia import AutoModelForCausalLM 6 | from transformers import AutoTokenizer 7 | 8 | sys.path.append("/mnt") 9 | sys.path.append("/mnt/benchmarks/") 10 | 11 | from common.base import BaseBenchmarkClass # noqa 12 | from common.utils import launch_cli, make_report # noqa 13 | 14 | 15 | class OptimumBenchmark(BaseBenchmarkClass): 16 | def __init__( 17 | self, 18 | model_path: str, 19 | model_name: str, 20 | benchmark_name: str, 21 | precision: str, 22 | device: str, 23 | experiment_name: str, 24 | ) -> None: 25 | assert precision in ["float32", "float16"], ValueError( 26 | "Supported precision: 'float32' and 'float16'" 27 | ) 28 | super().__init__( 29 | model_name=model_name, 30 | model_path=model_path, 31 | benchmark_name=benchmark_name, 32 | experiment_name=experiment_name, 33 | precision=precision, 34 | device=device, 35 | root_folder="/mnt/benchmarks", 36 | ) 37 | 38 | if model_name == "llama": 39 | self.tokenizer_folder = os.path.join( 40 | self.root_folder, "models", "llama-2-7b-chat-hf" 41 | ) 42 | else: 43 | self.tokenizer_folder = os.path.join( 44 | self.root_folder, "models", "mistral-7b-v0.1-instruct-hf" 45 | ) 46 | 47 | def load_model_and_tokenizer(self): 48 | dtype_mapper = {"float16": torch.float16, "float32": torch.float32} 49 | self.model = AutoModelForCausalLM.from_pretrained( 50 | pretrained_model_name_or_path=self.model_path, 51 | torch_dtype=dtype_mapper[self.precision], 52 | ) 53 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 54 | return self 55 | 56 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 57 | if chat_mode: 58 | template = self.get_chat_template_with_instruction( 59 | prompt=prompt, for_benchmarks=for_benchmarks 60 | ) 61 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 62 | 63 | tokenized_input = self.tokenizer.encode(text=prompt) 64 | tensor = self.tokenizer(prompt, return_tensors="pt") 65 | return { 66 | "prompt": prompt, 67 | "input_tokens": tokenized_input, 68 | "tensor": tensor, 69 | "num_input_tokens": len(tokenized_input), 70 | } 71 | 72 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 73 | tensor = inputs["tensor"] 74 | num_input_tokens = inputs["num_input_tokens"] 75 | 76 | generated, _ = self.model.generate( 77 | **tensor, 78 | top_k=40, 79 | top_p=0.1, 80 | pad_token_id=self.tokenizer.eos_token_id, 81 | eos_token_id=self.tokenizer.eos_token_id, 82 | temperature=temperature, 83 | max_new_tokens=max_tokens, 84 | ) 85 | 86 | output_tokens = generated[0].detach().tolist()[num_input_tokens:] 87 | return {"output_tokens": output_tokens, "num_output_tokens": 
len(output_tokens)} 88 | 89 | def postprocess(self, output: dict) -> str: 90 | output_tokens = output["output_tokens"] 91 | output_text = self.tokenizer.decode(output_tokens, skip_special_tokens=True) 92 | return output_text 93 | 94 | def on_exit(self): 95 | if self.device == "cuda:0": 96 | del self.model 97 | torch.cuda.synchronize() 98 | else: 99 | del self.model 100 | 101 | 102 | if __name__ == "__main__": 103 | parser = launch_cli(description="HF-Optimum Nvidia Benchmark.") 104 | args = parser.parse_args() 105 | 106 | model_folder = "/mnt/benchmarks/models" 107 | model_name = ( 108 | f"{args.model_name}-2-7b-chat-optimum" 109 | if args.model_name == "llama" 110 | else f"{args.model_name}-7b-v0.1-instruct-optimum" 111 | ) 112 | 113 | runner_dict = { 114 | "cuda": [ 115 | { 116 | "precision": "float32", 117 | "model_path": os.path.join(model_folder, model_name + "-float32"), 118 | }, 119 | { 120 | "precision": "float16", 121 | "model_path": os.path.join(model_folder, model_name + "-float16"), 122 | }, 123 | ] 124 | } 125 | 126 | make_report( 127 | args=args, 128 | benchmark_class=OptimumBenchmark, 129 | runner_dict=runner_dict, 130 | benchmark_name="HF-Optimum Nvidia", 131 | is_bench_pytorch=False, 132 | ) 133 | -------------------------------------------------------------------------------- /bench_optimum_nvidia/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks HF Optimum Nvidia benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | CURRENT_DIR="$(pwd)" 34 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 35 | 36 | check_cuda() { 37 | if command -v nvcc &> /dev/null 38 | then 39 | echo -e "\nUsing CUDA" 40 | nvcc --version 41 | else 42 | echo -e "\nCUDA is not available." 
43 | exit 1 44 | fi 45 | } 46 | 47 | check_platform() { 48 | local platform 49 | platform=$(uname -s) 50 | if [[ "$platform" == "Linux" ]]; then 51 | echo "Running on Linux." 52 | elif [[ "$platform" == "Darwin" ]]; then 53 | echo "Running on Mac OS." 54 | else 55 | echo "Unknown platform." 56 | exit 1 57 | fi 58 | } 59 | 60 | setup() { 61 | local MODEL_NAME="${1:-llama}" 62 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 63 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 64 | } 65 | 66 | # Parse command-line arguments 67 | while [ "$#" -gt 0 ]; do 68 | case "$1" in 69 | -p|--prompt) 70 | PROMPT="$2" 71 | shift 2 72 | ;; 73 | -r|--repetitions) 74 | REPETITIONS="$2" 75 | shift 2 76 | ;; 77 | -m|--max_tokens) 78 | MAX_TOKENS="$2" 79 | shift 2 80 | ;; 81 | -d|--device) 82 | DEVICE="$2" 83 | case "$DEVICE" in 84 | "cuda" | "metal" | "cpu") 85 | ;; 86 | *) 87 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 88 | print_usage 89 | ;; 90 | esac 91 | if [ "$DEVICE" == "cuda" ]; then 92 | check_cuda 93 | else 94 | echo "Not supported for $DEVICE" 95 | exit 1 96 | fi 97 | shift 2 98 | ;; 99 | -n|--model_name) 100 | MODEL_NAME="$2" 101 | shift 2 102 | ;; 103 | -h|--help) 104 | print_usage 105 | ;; 106 | *) 107 | echo "Unknown option: $1" 108 | print_usage 109 | ;; 110 | esac 111 | done 112 | 113 | check_platform 114 | setup "$MODEL_NAME" 115 | 116 | # Set default values if not provided 117 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 118 | REPETITIONS="${REPETITIONS:-10}" 119 | MAX_TOKENS="${MAX_TOKENS:-512}" 120 | DEVICE="${DEVICE:-'cuda'}" 121 | MODEL_NAME="${MODEL_NAME:-"llama"}" 122 | 123 | 124 | docker run \ 125 | --gpus all \ 126 | --ipc=host \ 127 | --ulimit memlock=-1 \ 128 | --ulimit stack=67108864 \ 129 | -e PYTHONUNBUFFERED=1 \ 130 | -v "$CURRENT_DIR:/mnt/benchmarks" \ 131 | -it huggingface/optimum-nvidia:latest \ 132 | python3 -u "/mnt/benchmarks/bench_optimum_nvidia/bench.py" \ 133 | --prompt "$PROMPT" \ 134 | --repetitions "$REPETITIONS" \ 135 | --max_tokens "$MAX_TOKENS" \ 136 | --model_name "$MODEL_NAME" \ 137 | --device "$DEVICE" 138 | -------------------------------------------------------------------------------- /bench_tensorrtllm/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks Nvidia TensorRT LLM benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 
15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | CURRENT_DIR="$(pwd)" 34 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 35 | 36 | check_cuda() { 37 | if command -v nvcc &> /dev/null 38 | then 39 | echo -e "\nUsing CUDA" 40 | nvcc --version 41 | else 42 | echo -e "\nCUDA is not available." 43 | exit 1 44 | fi 45 | } 46 | 47 | check_platform() { 48 | local platform 49 | platform=$(uname -s) 50 | if [[ "$platform" == "Linux" ]]; then 51 | echo "Running on Linux." 52 | elif [[ "$platform" == "Darwin" ]]; then 53 | echo "Running on Mac OS." 54 | else 55 | echo "Unknown platform." 56 | exit 1 57 | fi 58 | } 59 | 60 | setup() { 61 | local MODEL_NAME="${1:-llama}" 62 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 63 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 64 | } 65 | 66 | 67 | # Parse command-line arguments 68 | while [ "$#" -gt 0 ]; do 69 | case "$1" in 70 | -p|--prompt) 71 | PROMPT="$2" 72 | shift 2 73 | ;; 74 | -r|--repetitions) 75 | REPETITIONS="$2" 76 | shift 2 77 | ;; 78 | -m|--max_tokens) 79 | MAX_TOKENS="$2" 80 | shift 2 81 | ;; 82 | -d|--device) 83 | DEVICE="$2" 84 | case "$DEVICE" in 85 | "cuda" | "metal" | "cpu") 86 | ;; 87 | *) 88 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
89 | print_usage 90 | ;; 91 | esac 92 | if [ "$DEVICE" == "cuda" ]; then 93 | check_cuda 94 | else 95 | echo "Not supported for $DEVICE" 96 | exit 1 97 | fi 98 | shift 2 99 | ;; 100 | -n|--model_name) 101 | MODEL_NAME="$2" 102 | shift 2 103 | ;; 104 | -h|--help) 105 | print_usage 106 | ;; 107 | *) 108 | echo "Unknown option: $1" 109 | print_usage 110 | ;; 111 | esac 112 | done 113 | 114 | check_platform 115 | 116 | setup "${MODEL_NAME:-llama}" 117 | 118 | # Set default values if not provided 119 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 120 | REPETITIONS="${REPETITIONS:-10}" 121 | MAX_TOKENS="${MAX_TOKENS:-512}" 122 | DEVICE="${DEVICE:-cuda}" 123 | MODEL_NAME="${MODEL_NAME:-"llama"}" 124 | 125 | 126 | docker run \ 127 | --gpus all \ 128 | --ipc=host \ 129 | --ulimit memlock=-1 \ 130 | --ulimit stack=67108864 \ 131 | -e PYTHONUNBUFFERED=1 \ 132 | -v "$CURRENT_DIR:/mnt/benchmarks" \ 133 | -it tensorrt_llm/release:latest \ 134 | python3 -u "/mnt/benchmarks/bench_tensorrtllm/bench.py" \ 135 | --prompt "$PROMPT" \ 136 | --repetitions "$REPETITIONS" \ 137 | --max_tokens "$MAX_TOKENS" \ 138 | --model_name "$MODEL_NAME" \ 139 | --device "$DEVICE" 140 | -------------------------------------------------------------------------------- /bench_onnxruntime/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs the HF Optimum ONNX benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | CURRENT_DIR="$(pwd)" 34 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 35 | 36 | 37 | check_cuda() { 38 | if command -v nvcc &> /dev/null 39 | then 40 | echo -e "\nUsing CUDA" 41 | nvcc --version 42 | else 43 | echo -e "\nCUDA is not available."
44 | exit 1 45 | fi 46 | } 47 | 48 | check_platform() { 49 | local platform 50 | platform=$(uname -s) 51 | if [[ "$platform" == "Linux" ]]; then 52 | echo "Running on Linux." 53 | elif [[ "$platform" == "Darwin" ]]; then 54 | echo "Running on Mac OS." 55 | else 56 | echo "Unknown platform." 57 | exit 1 58 | fi 59 | } 60 | 61 | setup() { 62 | local MODEL_NAME="${1:-llama}" 63 | local DEVICE="$2" 64 | 65 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 66 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" "$DEVICE" 67 | } 68 | 69 | # Parse command-line arguments 70 | while [ "$#" -gt 0 ]; do 71 | case "$1" in 72 | -p|--prompt) 73 | PROMPT="$2" 74 | shift 2 75 | ;; 76 | -r|--repetitions) 77 | REPETITIONS="$2" 78 | shift 2 79 | ;; 80 | -m|--max_tokens) 81 | MAX_TOKENS="$2" 82 | shift 2 83 | ;; 84 | -d|--device) 85 | DEVICE="$2" 86 | case "$DEVICE" in 87 | "cuda" | "metal" | "cpu") 88 | ;; 89 | *) 90 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 91 | print_usage 92 | ;; 93 | esac 94 | if [ "$DEVICE" == "cuda" ]; then 95 | check_cuda 96 | else 97 | echo "Not supported for $DEVICE" 98 | exit 1 99 | fi 100 | shift 2 101 | ;; 102 | -n|--model_name) 103 | MODEL_NAME="$2" 104 | shift 2 105 | ;; 106 | -h|--help) 107 | print_usage 108 | ;; 109 | *) 110 | echo "Unknown option: $1" 111 | print_usage 112 | ;; 113 | esac 114 | done 115 | 116 | check_platform 117 | 118 | setup "${MODEL_NAME:-llama}" "${DEVICE:-cuda}" 119 | 120 | # Set default values if not provided 121 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 122 | REPETITIONS="${REPETITIONS:-10}" 123 | MAX_TOKENS="${MAX_TOKENS:-512}" 124 | DEVICE="${DEVICE:-cuda}" 125 | MODEL_NAME="${MODEL_NAME:-"llama"}" 126 | 127 | docker run \ 128 | --gpus all \ 129 | --ipc=host \ 130 | --ulimit memlock=-1 \ 131 | --ulimit stack=67108864 \ 132 | -e PYTHONUNBUFFERED=1 \ 133 | -v "$CURRENT_DIR:/mnt/benchmarks" \ 134 | -it anindyadeep/onnxruntime:latest \ 135 | python3 -u "/mnt/benchmarks/bench_onnxruntime/bench.py" \ 136 | --prompt "$PROMPT" \ 137 | --repetitions "$REPETITIONS" \ 138 | --max_tokens "$MAX_TOKENS" \ 139 | --model_name "$MODEL_NAME" \ 140 | --device "$DEVICE" 141 | -------------------------------------------------------------------------------- /bench_vllm/bench.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import os 3 | import sys 4 | 5 | import torch 6 | from transformers import AutoTokenizer 7 | from vllm import LLM, SamplingParams 8 | from vllm.model_executor.parallel_utils import parallel_state 9 | 10 | sys.path.append(os.getcwd()) 11 | 12 | from common.base import BaseBenchmarkClass # noqa 13 | from common.utils import launch_cli, make_report # noqa 14 | 15 | 16 | class VLLMBenchmark(BaseBenchmarkClass): 17 | def __init__( 18 | self, 19 | model_path: str, 20 | model_name: str, 21 | benchmark_name: str, 22 | precision: str, 23 | device: str, 24 | experiment_name: str, 25 | ) -> None: 26 | assert device == "cuda", ValueError("Only supported device is 'cuda'") 27 | assert precision in ["float16", "float32", "int4"], ValueError( 28 | "supported precisions are: 'float16', 'float32' and 'int4'" 29 | ) 30 | 31 | super().__init__( 32 | model_name=model_name, 33 | model_path=model_path, 34 | benchmark_name=benchmark_name, 35 | experiment_name=experiment_name, 36 | precision=precision, 37 | device=device, 38 | ) 39 | 40 | if model_name == "llama": 41 | self.tokenizer_folder = os.path.join( 42 | os.getcwd(), "models", "llama-2-7b-chat-hf" 43 | )
44 | else: 45 | self.tokenizer_folder = os.path.join( 46 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 47 | ) 48 | 49 | def load_model_and_tokenizer(self): 50 | if self.precision == "int4": 51 | self.model = LLM( 52 | model=self.model_path, quantization="AWQ", tensor_parallel_size=1 53 | ) 54 | else: 55 | self.model = LLM(model=self.model_path) 56 | self.model.dtype = self.precision 57 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 58 | return self 59 | 60 | def preprocess( 61 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 62 | ): 63 | if chat_mode: 64 | template = self.get_chat_template_with_instruction( 65 | prompt=prompt, for_benchmarks=for_benchmarks 66 | ) 67 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 68 | 69 | tokenized_input = self.tokenizer.encode(text=prompt) 70 | return { 71 | "prompt": prompt, 72 | "input_tokens": tokenized_input, 73 | "tensor": None, 74 | "num_input_tokens": len(tokenized_input), 75 | } 76 | 77 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 78 | prompt = [inputs["prompt"]] 79 | 80 | sampling_params = SamplingParams(max_tokens=max_tokens, temperature=temperature) 81 | output = self.model.generate(prompt, sampling_params) 82 | 83 | generated_text = output[0].outputs[0].text 84 | generated_tokens = output[0].outputs[0].token_ids 85 | 86 | return { 87 | "output_tokens": generated_tokens, 88 | "num_output_tokens": len(generated_tokens), 89 | "output_prompt": generated_text, 90 | } 91 | 92 | def postprocess(self, output: dict) -> str: 93 | return output["output_prompt"] 94 | 95 | def on_exit(self): 96 | if self.device == "cuda": 97 | parallel_state.destroy_model_parallel() 98 | del self.model 99 | gc.collect() 100 | torch.cuda.empty_cache() 101 | torch.distributed.destroy_process_group() 102 | torch.cuda.synchronize() 103 | else: 104 | del self.model 105 | 106 | 107 | if __name__ == "__main__": 108 | parser = launch_cli(description="vLLM Benchmark.") 109 | args = parser.parse_args() 110 | 111 | model_folder = os.path.join(os.getcwd(), "models") 112 | model_name = ( 113 | f"{args.model_name}-2-7b-chat-" 114 | if args.model_name == "llama" 115 | else f"{args.model_name}-7b-v0.1-instruct-" 116 | ) 117 | 118 | runner_dict = { 119 | "cuda": [ 120 | { 121 | "precision": "float32", 122 | "model_path": os.path.join(model_folder, model_name + "hf"), 123 | }, 124 | { 125 | "precision": "float16", 126 | "model_path": os.path.join(model_folder, model_name + "hf"), 127 | }, 128 | { 129 | "precision": "int4", 130 | "model_path": os.path.join(model_folder, model_name + "autoawq"), 131 | }, 132 | ] 133 | } 134 | 135 | make_report( 136 | args=args, 137 | benchmark_class=VLLMBenchmark, 138 | runner_dict=runner_dict, 139 | benchmark_name="vLLM", 140 | is_bench_pytorch=False, 141 | ) 142 | -------------------------------------------------------------------------------- /bench_pytorch/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks AutoAWQ llama benchmark. 
6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_platform() { 47 | local platform 48 | platform=$(uname -s) 49 | if [[ "$platform" == "Linux" ]]; then 50 | echo "Running on Linux." 51 | elif [[ "$platform" == "Darwin" ]]; then 52 | echo "Running on Mac OS." 53 | else 54 | echo "Unknown platform." 55 | exit 1 56 | fi 57 | } 58 | 59 | check_python() { 60 | if command -v python &> /dev/null; then 61 | PYTHON_CMD="python" 62 | elif command -v python3 &> /dev/null; then 63 | PYTHON_CMD="python3" 64 | else 65 | echo "Python is not installed." 66 | exit 1 67 | fi 68 | } 69 | 70 | setup() { 71 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 72 | bash "$SCRIPT_DIR"/setup.sh 73 | } 74 | 75 | run_benchmarks() { 76 | local PROMPT="$1" 77 | local REPETITIONS="$2" 78 | local MAX_TOKENS="$3" 79 | local DEVICE="$4" 80 | local MODEL_NAME="$5" 81 | 82 | # shellcheck disable=SC1091 83 | source "$SCRIPT_DIR/venv/bin/activate" 84 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 85 | --prompt "$PROMPT" \ 86 | --repetitions "$REPETITIONS" \ 87 | --max_tokens "$MAX_TOKENS" \ 88 | --model_name "$MODEL_NAME" \ 89 | --device "$DEVICE" 90 | } 91 | 92 | # Parse command-line arguments 93 | while [ "$#" -gt 0 ]; do 94 | case "$1" in 95 | -p|--prompt) 96 | PROMPT="$2" 97 | shift 2 98 | ;; 99 | -r|--repetitions) 100 | REPETITIONS="$2" 101 | shift 2 102 | ;; 103 | -m|--max_tokens) 104 | MAX_TOKENS="$2" 105 | shift 2 106 | ;; 107 | -d|--device) 108 | DEVICE="$2" 109 | case "$DEVICE" in 110 | "cuda" | "metal" | "cpu") 111 | ;; 112 | *) 113 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
114 | print_usage 115 | ;; 116 | esac 117 | if [ "$DEVICE" == "cuda" ]; then 118 | check_cuda 119 | else 120 | echo "Not supported for $DEVICE" 121 | exit 1 122 | fi 123 | shift 2 124 | ;; 125 | -n|--model_name) 126 | MODEL_NAME="$2" 127 | shift 2 128 | ;; 129 | -h|--help) 130 | print_usage 131 | ;; 132 | *) 133 | echo "Unknown option: $1" 134 | print_usage 135 | ;; 136 | esac 137 | done 138 | 139 | check_platform 140 | check_python 141 | setup 142 | 143 | # Set default values if not provided 144 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 145 | REPETITIONS="${REPETITIONS:-10}" 146 | MAX_TOKENS="${MAX_TOKENS:-512}" 147 | DEVICE="${DEVICE:-'cuda'}" 148 | MODEL_NAME="${MODEL_NAME:-"llama"}" 149 | 150 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 151 | -------------------------------------------------------------------------------- /bench_ctransformers/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements including CTransformers and GGUF weights. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | # Define constants and paths 12 | CURRENT_DIR="$(pwd)" 13 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 14 | VENV_DIR="$SCRIPT_DIR/venv" 15 | MODELS_DIR="$CURRENT_DIR/models" 16 | LLAMA2_GGUF_WEIGHTS_DIR="$MODELS_DIR/llama-2-7b-chat-gguf" 17 | MISTRAL_GGUF_WEIGHTS_DIR="$MODELS_DIR/mistral-7b-v0.1-instruct-gguf" 18 | 19 | # Check if Python is installed 20 | check_python() { 21 | if command -v python &> /dev/null; then 22 | PYTHON_CMD="python" 23 | elif command -v python3 &> /dev/null; then 24 | PYTHON_CMD="python3" 25 | else 26 | echo "Python is not installed." 27 | exit 1 28 | fi 29 | } 30 | 31 | install_ctransformers_cuda() { 32 | CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \(.*\),.*/\1/p') 33 | 34 | if [ -z "$CUDA_VERSION" ]; then 35 | echo "CUDA is not installed or not found." 36 | exit 1 37 | fi 38 | 39 | CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1) 40 | CUDA_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f2) 41 | 42 | if [ "$CUDA_MAJOR" -gt 12 ] || { [ "$CUDA_MAJOR" -eq 12 ] && [ "$CUDA_MINOR" -ge 2 ]; }; then 43 | echo "Detected CUDA version >= 12.2" 44 | pip install ctransformers[cuda] > /dev/null 45 | else 46 | echo "Detected CUDA version < 12.2" 47 | CMAKE_ARGS="-DCMAKE_CUDA_COMPILER=$(which nvcc)" CT_CUBLAS=1 pip install ctransformers --no-binary ctransformers > /dev/null 48 | fi 49 | } 50 | 51 | # Install CTransformers based on the specified device 52 | install_ctransformers() { 53 | local DEVICE="$1" 54 | 55 | case "$DEVICE" in 56 | cuda) 57 | echo "Installing CTransformers for CUDA." 58 | install_ctransformers_cuda 59 | ;; 60 | metal) 61 | echo "Installing CTransformers for Metal." 62 | pip uninstall ctransformers --yes 63 | CT_METAL=1 pip install ctransformers --no-binary ctransformers 64 | ;; 65 | cpu) 66 | echo "Installing CTransformers for CPU." 
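# The stock PyPI wheel already ships CPU support, so no extra build flags are needed for this branch
# (the Metal branch and the pre-12.2 CUDA branch above rebuild the package from source instead).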
67 | pip install ctransformers > /dev/null 68 | ;; 69 | *) 70 | echo "Unsupported DEVICE: $DEVICE" 71 | exit 1 72 | ;; 73 | esac 74 | } 75 | 76 | # Download GGUF weights for the specified model 77 | download_gguf_weights() { 78 | local MODEL_NAME="$1" 79 | local DOWNLOAD_DIR 80 | 81 | case "$MODEL_NAME" in 82 | llama) 83 | DOWNLOAD_DIR="$LLAMA2_GGUF_WEIGHTS_DIR" 84 | MODEL_IDENTIFIER="TheBloke/Llama-2-7B-Chat-GGUF" 85 | MODEL_FILE_4BIT="llama-2-7b-chat.Q4_K_M.gguf" 86 | MODEL_FILE_8BIT="llama-2-7b-chat.Q8_0.gguf" 87 | ;; 88 | mistral) 89 | DOWNLOAD_DIR="$MISTRAL_GGUF_WEIGHTS_DIR" 90 | MODEL_IDENTIFIER="TheBloke/Mistral-7B-Instruct-v0.1-GGUF" 91 | MODEL_FILE_4BIT="mistral-7b-instruct-v0.1.Q4_K_M.gguf" 92 | MODEL_FILE_8BIT="mistral-7b-instruct-v0.1.Q8_0.gguf" 93 | ;; 94 | *) 95 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 96 | exit 1 97 | ;; 98 | esac 99 | 100 | if [ ! -d "$DOWNLOAD_DIR" ]; then 101 | huggingface-cli download "$MODEL_IDENTIFIER" "$MODEL_FILE_4BIT" --local-dir "$DOWNLOAD_DIR" --local-dir-use-symlinks False 102 | huggingface-cli download "$MODEL_IDENTIFIER" "$MODEL_FILE_8BIT" --local-dir "$DOWNLOAD_DIR" --local-dir-use-symlinks False 103 | else 104 | echo "Weights for $MODEL_NAME already downloaded." 105 | fi 106 | } 107 | 108 | # Main script starts here 109 | 110 | if [ "$#" -ne 2 ]; then 111 | echo "Usage: $0 " 112 | exit 1 113 | fi 114 | 115 | check_python 116 | 117 | # Define command line arguments 118 | DEVICE="$1" 119 | MODEL_NAME="$2" 120 | 121 | if [ ! -d "$VENV_DIR" ]; then 122 | "$PYTHON_CMD" -m venv "$VENV_DIR" 123 | echo "Virtual environment '$VENV_DIR' created." 124 | if [ -f "$VENV_DIR/bin/activate" ]; then 125 | # shellcheck disable=SC1091 126 | source "$VENV_DIR/bin/activate" 127 | else 128 | echo "Error: Unable to find virtual environment activation script." 129 | exit 1 130 | fi 131 | 132 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 133 | "$PYTHON_CMD" -m pip install -r "$SCRIPT_DIR/requirements.txt" --no-cache-dir > /dev/null 134 | install_ctransformers "$DEVICE" 135 | else 136 | if [ -f "$VENV_DIR/bin/activate" ]; then 137 | # shellcheck disable=SC1091 138 | source "$VENV_DIR/bin/activate" 139 | else 140 | echo "Error: Unable to find virtual environment activation script." 141 | exit 1 142 | fi 143 | fi 144 | 145 | 146 | download_gguf_weights "$MODEL_NAME" 147 | -------------------------------------------------------------------------------- /bench_deepspeed/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs DeepSpeed benchmark for Llama 2 Chat and Mistral v0.1 Instruct 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 
15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | 36 | check_cuda() { 37 | if command -v nvcc &> /dev/null 38 | then 39 | echo -e "\nUsing CUDA" 40 | nvcc --version 41 | else 42 | echo -e "\nCUDA is not available." 43 | exit 1 44 | fi 45 | } 46 | 47 | check_platform() { 48 | local platform 49 | platform=$(uname -s) 50 | if [[ "$platform" == "Linux" ]]; then 51 | echo "Running on Linux." 52 | elif [[ "$platform" == "Darwin" ]]; then 53 | echo "Running on Mac OS." 54 | else 55 | echo "Unknown platform." 56 | exit 1 57 | fi 58 | } 59 | 60 | check_python() { 61 | if command -v python &> /dev/null; then 62 | PYTHON_CMD="python" 63 | elif command -v python3 &> /dev/null; then 64 | PYTHON_CMD="python3" 65 | else 66 | echo "Python is not installed." 67 | exit 1 68 | fi 69 | } 70 | 71 | setup() { 72 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 73 | bash "$SCRIPT_DIR"/setup.sh 74 | } 75 | 76 | run_benchmarks() { 77 | local PROMPT="$1" 78 | local REPETITIONS="$2" 79 | local MAX_TOKENS="$3" 80 | local DEVICE="$4" 81 | local MODEL_NAME="$5" 82 | 83 | # shellcheck disable=SC1091 84 | source "$SCRIPT_DIR/venv/bin/activate" 85 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 86 | --prompt "$PROMPT" \ 87 | --repetitions "$REPETITIONS" \ 88 | --max_tokens "$MAX_TOKENS" \ 89 | --model_name "$MODEL_NAME" \ 90 | --device "$DEVICE" 91 | } 92 | 93 | while [ "$#" -gt 0 ]; do 94 | case "$1" in 95 | -p|--prompt) 96 | PROMPT="$2" 97 | shift 2 98 | ;; 99 | -r|--repetitions) 100 | REPETITIONS="$2" 101 | shift 2 102 | ;; 103 | -m|--max_tokens) 104 | MAX_TOKENS="$2" 105 | shift 2 106 | ;; 107 | -d|--device) 108 | DEVICE="$2" 109 | case "$DEVICE" in 110 | "cuda" | "metal" | "cpu") 111 | ;; 112 | *) 113 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
114 | print_usage 115 | ;; 116 | esac 117 | if [ "$DEVICE" == "cuda" ]; then 118 | check_cuda 119 | else 120 | echo "Not supported for $DEVICE" 121 | exit 1 122 | fi 123 | shift 2 124 | ;; 125 | -n|--model_name) 126 | MODEL_NAME="$2" 127 | shift 2 128 | ;; 129 | -h|--help) 130 | print_usage 131 | ;; 132 | *) 133 | echo "Unknown option: $1" 134 | print_usage 135 | ;; 136 | esac 137 | done 138 | 139 | 140 | check_platform 141 | check_python 142 | setup 143 | 144 | # Set default values if not provided 145 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 146 | REPETITIONS="${REPETITIONS:-10}" 147 | MAX_TOKENS="${MAX_TOKENS:-512}" 148 | DEVICE="${DEVICE:-'cuda'}" 149 | MODEL_NAME="${MODEL_NAME:-"llama"}" 150 | 151 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 152 | -------------------------------------------------------------------------------- /bench_exllamav2/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from exllamav2 import ExLlamaV2, ExLlamaV2Cache 6 | from exllamav2.config import ExLlamaV2Config 7 | from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler 8 | from exllamav2.tokenizer.tokenizer import ExLlamaV2Tokenizer 9 | from transformers import AutoTokenizer 10 | 11 | sys.path.append(os.getcwd()) 12 | 13 | from common.base import BaseBenchmarkClass # noqa 14 | from common.utils import launch_cli, make_report # noqa 15 | 16 | 17 | class ExLlamaV2Benchmark(BaseBenchmarkClass): 18 | def __init__( 19 | self, 20 | model_path: str, 21 | model_name: str, 22 | benchmark_name: str, 23 | precision: str, 24 | device: str, 25 | experiment_name: str, 26 | ) -> None: 27 | assert precision in ["int8", "int4"], ValueError( 28 | "Available precision: 'int8', 'int4'" 29 | ) 30 | super().__init__( 31 | model_name=model_name, 32 | model_path=model_path, 33 | benchmark_name=benchmark_name, 34 | experiment_name=experiment_name, 35 | precision=precision, 36 | device=device, 37 | ) 38 | 39 | def load_model_and_tokenizer(self): 40 | # set up model config 41 | self.config = ExLlamaV2Config() 42 | self.config.model_dir = self.model_path 43 | self.config.prepare() 44 | 45 | # set up model and cache 46 | self._model = ExLlamaV2(self.config) 47 | self.cache = ExLlamaV2Cache(self._model, lazy=True) 48 | self._model.load_autosplit(self.cache) 49 | self.tokenizer_exllama = ExLlamaV2Tokenizer(self.config) 50 | self.model = ExLlamaV2BaseGenerator( 51 | self._model, self.cache, self.tokenizer_exllama 52 | ) 53 | self.model.warmup() 54 | 55 | # set up the huggingface tokenizer 56 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) 57 | 58 | # set up exllamav2 settings 59 | self.settings = ExLlamaV2Sampler.Settings() 60 | self.settings.disallow_tokens( 61 | self.tokenizer_exllama, [self.tokenizer_exllama.eos_token_id] 62 | ) 63 | return self 64 | 65 | def preprocess( 66 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 67 | ): 68 | if chat_mode: 69 | template = self.get_chat_template_with_instruction( 70 | prompt=prompt, for_benchmarks=for_benchmarks 71 | ) 72 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 73 | tokenized_input = self.tokenizer.encode(text=prompt) 74 | return { 75 | "prompt": prompt, 76 | "input_tokens": tokenized_input, 77 | "tensor": None, 78 | "num_input_tokens": len(tokenized_input), 79 | } 80 | 81 | def run_model(self, inputs: dict, max_tokens: int, temperature: 
float) -> dict: 82 | # first set up the settings 83 | self.settings.token_repetition_penalty = 1.01 84 | self.settings.temperature = temperature 85 | self.settings.top_k = 50 86 | self.settings.top_p = 0.1 87 | 88 | # now run the model 89 | prompt = inputs["prompt"] 90 | output_text = self.model.generate_simple( 91 | prompt, 92 | self.settings, 93 | max_tokens, 94 | seed=1234, 95 | completion_only=True, 96 | decode_special_tokens=True, 97 | ) 98 | 99 | tokenized_output = self.tokenizer.encode(output_text) 100 | return { 101 | "output_text": output_text, 102 | "output_tokens": tokenized_output, 103 | "num_output_tokens": len(tokenized_output), 104 | } 105 | 106 | def postprocess(self, output: dict) -> str: 107 | return output["output_text"] 108 | 109 | def on_exit(self): 110 | if self.device == "cuda": 111 | del self.model 112 | torch.cuda.synchronize() 113 | else: 114 | del self.model 115 | 116 | 117 | if __name__ == "__main__": 118 | parser = launch_cli(description="ExLlamaV2 Benchmark.") 119 | args = parser.parse_args() 120 | 121 | model_folder = os.path.join(os.getcwd(), "models") 122 | model_name = ( 123 | f"{args.model_name}-2-7b-chat-exllamav2-" 124 | if args.model_name == "llama" 125 | else f"{args.model_name}-7b-v0.1-instruct-exllamav2-" 126 | ) 127 | 128 | runner_dict = { 129 | "cuda": [ 130 | { 131 | "precision": "int4", 132 | "model_path": os.path.join(model_folder, model_name + "4.0-bit"), 133 | }, 134 | { 135 | "precision": "int8", 136 | "model_path": os.path.join(model_folder, model_name + "8.0-bit"), 137 | }, 138 | ] 139 | } 140 | 141 | make_report( 142 | args=args, 143 | benchmark_class=ExLlamaV2Benchmark, 144 | runner_dict=runner_dict, 145 | benchmark_name="ExLlamaV2", 146 | is_bench_pytorch=False, 147 | ) 148 | -------------------------------------------------------------------------------- /docs/ml_engines.md: -------------------------------------------------------------------------------- 1 | # 🔧 ML Engines 2 | 3 | ### Model Framework Support Matrix 4 | 5 | | Engine | Float32 | Float16 | Int8 | Int4 | CUDA | ROCM | Mac M1/M2 | Training | 6 | | ------------------------------------------ | :-----: | :-----: | :---: | :---: | :---: | :---: | :-------: | :------: | 7 | | [candle](/bench_candle/) | ⚠️ | ✅ | ⚠️ | ⚠️ | ✅ | ❌ | 🚧 | ❌ | 8 | | [llama.cpp](/bench_llamacpp/) | ❌ | ❌ | ✅ | ✅ | ✅ | 🚧 | 🚧 | ❌ | 9 | | [ctranslate](/bench_ctranslate/) | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | 🚧 | ❌ | 10 | | [onnx](/bench_onnxruntime/) | ✅ | ✅ | ❌ | ❌ | ✅ | ⚠️ | ❌ | ❌ | 11 | | [transformers (pytorch)](/bench_pytorch/) | ✅ | ✅ | ✅ | ✅ | ✅ | 🚧 | ✅ | ✅ | 12 | | [vllm](/bench_vllm/) | ✅ | ✅ | ❌ | ✅ | ✅ | 🚧 | ❌ | ❌ | 13 | | [exllamav2](/bench_exllamav2/) | ❌ | ❌ | ✅ | ✅ | ✅ | 🚧 | ❌ | ❌ | 14 | | [ctransformers](/bench_ctransformers/) | ❌ | ❌ | ✅ | ✅ | ✅ | 🚧 | 🚧 | ❌ | 15 | | [AutoGPTQ](/bench_autogptq/) | ✅ | ✅ | ⚠️ | ⚠️ | ✅ | ❌ | ❌ | ❌ | 16 | | [AutoAWQ](/bench_autoawq/) | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | 17 | | [DeepSpeed-MII](/bench_deepspeed/) | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ⚠️ | 18 | | [PyTorch Lightning](/bench_lightning/) | ✅ | ✅ | ✅ | ✅ | ✅ | ⚠️ | ⚠️ | ✅ | 19 | | [Optimum Nvidia](/bench_optimum_nvidia/) | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | 20 | | [Nvidia TensorRT-LLM](/bench_tensorrtllm/) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | 21 | 22 | 23 | ### Legend: 24 | - ✅ Supported 25 | - ❌ Not Supported 26 | - ⚠️ There is a catch related to this 27 | - 🚧 It is supported but not implemented in this current version 28 | 29 | 30 | ### Some pointers to note: 31 | The names are by the name of engines. 
Except when the name is `Generic` then it means that the nuance applies to all the engines. 32 | 33 | 34 | | Name | Type | Description | 35 | | ----------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 36 | | candle | ⚠️ | Metal backend is supported but it gives terrible performance even in small models like Phi2. For AMD ROCM there is no support as per this [issue](https://github.com/huggingface/candle/issues/346). | 37 | | candle | 🚧 | Latest performance for Candle is not implemented. If you want to see the numbers, please check out [archive.md](/docs/archive.md) which contains the benchmark numbers for [Llama 2 7B](https://huggingface.co/meta-llama/Llama-2-7b). | 38 | | ctranslate2 | ⚠️ | ROCM is not supported; however, works are in progress to have this feature on CTranslate2. No support for Mac M1/M2. | 39 | | onnxruntime | ⚠️ | ONNXRuntime in general supports ROCM, but specific to LLMs and ONNXRuntime with HuggingFace Optimum only supports CUDAExecution provider right now. For CPU, it is available but super slow. | 40 | | pytorch lightning | ⚠️ | ROCM is supported but not tested for PyTorch Lightning. See this [issue](https://github.com/Lightning-AI/litgpt/issues/1220). | 41 | | pytorch lightning | ⚠️ | Metal is supported in PyTorch Lightning, but for Llama 2 7B Chat or Mistral 7B, it is super slow. | 42 | | AutoGPTQ | ⚠️ | AutoGPTQ is a weight-only quantization algorithm. Activation still remains in either float32 or float16. We used a 4-bit weight quantized model for our benchmarks experiment. | 43 | | Generic | 🚧 | For all the engines which support metal, please check out [archive.md](/docs/archive.md) which contains the benchmark numbers for [Llama 2 7B](https://huggingface.co/meta-llama/Llama-2-7b). | 44 | | Deepspeed | ⚠️ | [DeepSpeed](https://github.com/microsoft/DeepSpeed) supports training; however, for inference, we have used [DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII). | 45 | -------------------------------------------------------------------------------- /bench_exllamav2/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks ExLlamaV2 benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 
15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_platform() { 47 | local platform 48 | platform=$(uname -s) 49 | if [[ "$platform" == "Linux" ]]; then 50 | echo "Running on Linux." 51 | elif [[ "$platform" == "Darwin" ]]; then 52 | echo "Running on Mac OS." 53 | else 54 | echo "Unknown platform." 55 | exit 1 56 | fi 57 | } 58 | 59 | check_python() { 60 | if command -v python &> /dev/null; then 61 | PYTHON_CMD="python" 62 | elif command -v python3 &> /dev/null; then 63 | PYTHON_CMD="python3" 64 | else 65 | echo "Python is not installed." 66 | exit 1 67 | fi 68 | } 69 | 70 | setup() { 71 | local MODEL_NAME="${1:-llama}" 72 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 73 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 74 | } 75 | 76 | run_benchmarks() { 77 | local PROMPT="$1" 78 | local REPETITIONS="$2" 79 | local MAX_TOKENS="$3" 80 | local DEVICE="$4" 81 | local MODEL_NAME="$5" 82 | 83 | # shellcheck disable=SC1091 84 | source "$SCRIPT_DIR/venv/bin/activate" 85 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 86 | --prompt "$PROMPT" \ 87 | --repetitions "$REPETITIONS" \ 88 | --max_tokens "$MAX_TOKENS" \ 89 | --model_name "$MODEL_NAME" \ 90 | --device "$DEVICE" 91 | } 92 | 93 | 94 | while [ "$#" -gt 0 ]; do 95 | case "$1" in 96 | -p|--prompt) 97 | PROMPT="$2" 98 | shift 2 99 | ;; 100 | -r|--repetitions) 101 | REPETITIONS="$2" 102 | shift 2 103 | ;; 104 | -m|--max_tokens) 105 | MAX_TOKENS="$2" 106 | shift 2 107 | ;; 108 | -d|--device) 109 | DEVICE="$2" 110 | case "$DEVICE" in 111 | "cuda" | "metal" | "cpu") 112 | ;; 113 | *) 114 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
115 | print_usage 116 | ;; 117 | esac 118 | if [ "$DEVICE" == "cuda" ]; then 119 | check_cuda 120 | else 121 | echo "Not supported for $DEVICE" 122 | exit 1 123 | fi 124 | shift 2 125 | ;; 126 | -n|--model_name) 127 | MODEL_NAME="$2" 128 | shift 2 129 | ;; 130 | -h|--help) 131 | print_usage 132 | ;; 133 | *) 134 | echo "Unknown option: $1" 135 | print_usage 136 | ;; 137 | esac 138 | done 139 | 140 | check_platform 141 | check_python 142 | setup "$MODEL_NAME" 143 | 144 | # Set default values if not provided 145 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 146 | REPETITIONS="${REPETITIONS:-10}" 147 | MAX_TOKENS="${MAX_TOKENS:-512}" 148 | DEVICE="${DEVICE:-'cuda'}" 149 | MODEL_NAME="${MODEL_NAME:-"llama"}" 150 | 151 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 152 | -------------------------------------------------------------------------------- /bench_autogptq/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks AutoGPTQ benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | 36 | check_cuda() { 37 | if command -v nvcc &> /dev/null 38 | then 39 | echo -e "\nUsing CUDA" 40 | nvcc --version 41 | else 42 | echo -e "\nCUDA is not available." 43 | exit 1 44 | fi 45 | } 46 | 47 | check_platform() { 48 | local platform 49 | platform=$(uname -s) 50 | if [[ "$platform" == "Linux" ]]; then 51 | echo "Running on Linux." 52 | elif [[ "$platform" == "Darwin" ]]; then 53 | echo "Running on Mac OS." 54 | else 55 | echo "Unknown platform." 
56 | exit 1 57 | fi 58 | } 59 | 60 | check_python() { 61 | if command -v python &> /dev/null; then 62 | PYTHON_CMD="python" 63 | elif command -v python3 &> /dev/null; then 64 | PYTHON_CMD="python3" 65 | else 66 | echo "Python is not installed." 67 | exit 1 68 | fi 69 | } 70 | 71 | 72 | setup() { 73 | local MODEL_NAME="${1:-llama}" 74 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 75 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 76 | } 77 | 78 | run_benchmarks() { 79 | local PROMPT="$1" 80 | local REPETITIONS="$2" 81 | local MAX_TOKENS="$3" 82 | local DEVICE="$4" 83 | local MODEL_NAME="$5" 84 | 85 | # shellcheck disable=SC1091 86 | source "$SCRIPT_DIR/venv/bin/activate" 87 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 88 | --prompt "$PROMPT" \ 89 | --repetitions "$REPETITIONS" \ 90 | --max_tokens "$MAX_TOKENS" \ 91 | --model_name "$MODEL_NAME" \ 92 | --device "$DEVICE" 93 | } 94 | 95 | while [ "$#" -gt 0 ]; do 96 | case "$1" in 97 | -p|--prompt) 98 | PROMPT="$2" 99 | shift 2 100 | ;; 101 | -r|--repetitions) 102 | REPETITIONS="$2" 103 | shift 2 104 | ;; 105 | -m|--max_tokens) 106 | MAX_TOKENS="$2" 107 | shift 2 108 | ;; 109 | -d|--device) 110 | DEVICE="$2" 111 | case "$DEVICE" in 112 | "cuda" | "metal" | "cpu") 113 | ;; 114 | *) 115 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 116 | print_usage 117 | ;; 118 | esac 119 | if [ "$DEVICE" == "cuda" ]; then 120 | check_cuda 121 | else 122 | echo "Not supported for $DEVICE" 123 | exit 1 124 | fi 125 | shift 2 126 | ;; 127 | -n|--model_name) 128 | MODEL_NAME="$2" 129 | shift 2 130 | ;; 131 | -h|--help) 132 | print_usage 133 | ;; 134 | *) 135 | echo "Unknown option: $1" 136 | print_usage 137 | ;; 138 | esac 139 | done 140 | 141 | check_platform 142 | check_python 143 | setup "$MODEL_NAME" 144 | 145 | # Set default values if not provided 146 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 147 | REPETITIONS="${REPETITIONS:-10}" 148 | MAX_TOKENS="${MAX_TOKENS:-512}" 149 | DEVICE="${DEVICE:-'cuda'}" 150 | MODEL_NAME="${MODEL_NAME:-"llama"}" 151 | 152 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 153 | -------------------------------------------------------------------------------- /bench_autoawq/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks AutoAWQ llama benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 
15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | 36 | check_cuda() { 37 | if command -v nvcc &> /dev/null 38 | then 39 | echo -e "\nUsing CUDA" 40 | nvcc --version 41 | else 42 | echo -e "\nCUDA is not available." 43 | exit 1 44 | fi 45 | } 46 | 47 | check_platform() { 48 | local platform 49 | platform=$(uname -s) 50 | if [[ "$platform" == "Linux" ]]; then 51 | echo "Running on Linux." 52 | elif [[ "$platform" == "Darwin" ]]; then 53 | echo "Running on Mac OS." 54 | else 55 | echo "Unknown platform." 56 | exit 1 57 | fi 58 | } 59 | 60 | check_python() { 61 | if command -v python &> /dev/null; then 62 | PYTHON_CMD="python" 63 | elif command -v python3 &> /dev/null; then 64 | PYTHON_CMD="python3" 65 | else 66 | echo "Python is not installed." 67 | exit 1 68 | fi 69 | } 70 | 71 | 72 | setup() { 73 | local MODEL_NAME="${1:-llama}" 74 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 75 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 76 | } 77 | 78 | run_benchmarks() { 79 | local PROMPT="$1" 80 | local REPETITIONS="$2" 81 | local MAX_TOKENS="$3" 82 | local DEVICE="$4" 83 | local MODEL_NAME="$5" 84 | 85 | # shellcheck disable=SC1091 86 | source "$SCRIPT_DIR/venv/bin/activate" 87 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 88 | --prompt "$PROMPT" \ 89 | --repetitions "$REPETITIONS" \ 90 | --max_tokens "$MAX_TOKENS" \ 91 | --model_name "$MODEL_NAME" \ 92 | --device "$DEVICE" 93 | } 94 | 95 | while [ "$#" -gt 0 ]; do 96 | case "$1" in 97 | -p|--prompt) 98 | PROMPT="$2" 99 | shift 2 100 | ;; 101 | -r|--repetitions) 102 | REPETITIONS="$2" 103 | shift 2 104 | ;; 105 | -m|--max_tokens) 106 | MAX_TOKENS="$2" 107 | shift 2 108 | ;; 109 | -d|--device) 110 | DEVICE="$2" 111 | case "$DEVICE" in 112 | "cuda" | "metal" | "cpu") 113 | ;; 114 | *) 115 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
116 | print_usage 117 | ;; 118 | esac 119 | if [ "$DEVICE" == "cuda" ]; then 120 | check_cuda 121 | else 122 | echo "Not supported for $DEVICE" 123 | exit 1 124 | fi 125 | shift 2 126 | ;; 127 | -n|--model_name) 128 | MODEL_NAME="$2" 129 | shift 2 130 | ;; 131 | -h|--help) 132 | print_usage 133 | ;; 134 | *) 135 | echo "Unknown option: $1" 136 | print_usage 137 | ;; 138 | esac 139 | done 140 | 141 | check_platform 142 | check_python 143 | setup "$MODEL_NAME" 144 | 145 | # Set default values if not provided 146 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 147 | REPETITIONS="${REPETITIONS:-10}" 148 | MAX_TOKENS="${MAX_TOKENS:-512}" 149 | DEVICE="${DEVICE:-'cuda'}" 150 | MODEL_NAME="${MODEL_NAME:-"llama"}" 151 | 152 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 153 | -------------------------------------------------------------------------------- /bench_lightning/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks PyTorch Lightning benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_platform() { 47 | local platform 48 | platform=$(uname -s) 49 | if [[ "$platform" == "Linux" ]]; then 50 | echo "Running on Linux." 51 | elif [[ "$platform" == "Darwin" ]]; then 52 | echo "Running on Mac OS." 53 | else 54 | echo "Unknown platform." 
55 | exit 1 56 | fi 57 | } 58 | 59 | check_python() { 60 | if command -v python &> /dev/null; then 61 | PYTHON_CMD="python" 62 | elif command -v python3 &> /dev/null; then 63 | PYTHON_CMD="python3" 64 | else 65 | echo "Python is not installed." 66 | exit 1 67 | fi 68 | } 69 | 70 | 71 | setup() { 72 | local MODEL_NAME="${1:-llama}" 73 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 74 | bash "$SCRIPT_DIR/setup.sh" "$MODEL_NAME" 75 | } 76 | 77 | run_benchmarks() { 78 | local PROMPT="$1" 79 | local REPETITIONS="$2" 80 | local MAX_TOKENS="$3" 81 | local DEVICE="$4" 82 | local MODEL_NAME="$5" 83 | 84 | # shellcheck disable=SC1091 85 | source "$SCRIPT_DIR/venv/bin/activate" 86 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 87 | --prompt "$PROMPT" \ 88 | --repetitions "$REPETITIONS" \ 89 | --max_tokens "$MAX_TOKENS" \ 90 | --model_name "$MODEL_NAME" \ 91 | --device "$DEVICE" 92 | } 93 | 94 | while [ "$#" -gt 0 ]; do 95 | case "$1" in 96 | -p|--prompt) 97 | PROMPT="$2" 98 | shift 2 99 | ;; 100 | -r|--repetitions) 101 | REPETITIONS="$2" 102 | shift 2 103 | ;; 104 | -m|--max_tokens) 105 | MAX_TOKENS="$2" 106 | shift 2 107 | ;; 108 | -d|--device) 109 | DEVICE="$2" 110 | case "$DEVICE" in 111 | "cuda" | "metal" | "cpu") 112 | ;; 113 | *) 114 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 115 | print_usage 116 | ;; 117 | esac 118 | if [ "$DEVICE" == "cuda" ]; then 119 | check_cuda 120 | else 121 | echo "Not supported for $DEVICE" 122 | exit 1 123 | fi 124 | shift 2 125 | ;; 126 | -n|--model_name) 127 | MODEL_NAME="$2" 128 | shift 2 129 | ;; 130 | -h|--help) 131 | print_usage 132 | ;; 133 | *) 134 | echo "Unknown option: $1" 135 | print_usage 136 | ;; 137 | esac 138 | done 139 | 140 | check_platform 141 | check_python 142 | setup "$MODEL_NAME" 143 | 144 | # Set default values if not provided 145 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 146 | REPETITIONS="${REPETITIONS:-10}" 147 | MAX_TOKENS="${MAX_TOKENS:-512}" 148 | DEVICE="${DEVICE:-'cuda'}" 149 | MODEL_NAME="${MODEL_NAME:-"llama"}" 150 | 151 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 152 | -------------------------------------------------------------------------------- /bench_vllm/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks vLLM benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 
15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_platform() { 47 | local platform 48 | platform=$(uname -s) 49 | if [[ "$platform" == "Linux" ]]; then 50 | echo "Running on Linux." 51 | elif [[ "$platform" == "Darwin" ]]; then 52 | echo "Running on Mac OS." 53 | else 54 | echo "Unknown platform." 55 | exit 1 56 | fi 57 | } 58 | 59 | check_python() { 60 | if command -v python &> /dev/null; then 61 | PYTHON_CMD="python" 62 | elif command -v python3 &> /dev/null; then 63 | PYTHON_CMD="python3" 64 | else 65 | echo "Python is not installed." 66 | exit 1 67 | fi 68 | } 69 | 70 | 71 | setup() { 72 | local DEVICE="$1" 73 | local MODEL_NAME="${2:-llama}" 74 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 75 | bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME" 76 | } 77 | 78 | run_benchmarks() { 79 | local PROMPT="$1" 80 | local REPETITIONS="$2" 81 | local MAX_TOKENS="$3" 82 | local DEVICE="$4" 83 | local MODEL_NAME="$5" 84 | 85 | # shellcheck disable=SC1091 86 | source "$SCRIPT_DIR/venv/bin/activate" 87 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 88 | --prompt "$PROMPT" \ 89 | --repetitions "$REPETITIONS" \ 90 | --max_tokens "$MAX_TOKENS" \ 91 | --model_name "$MODEL_NAME" \ 92 | --device "$DEVICE" 93 | } 94 | 95 | # Parse command-line arguments 96 | while [ "$#" -gt 0 ]; do 97 | case "$1" in 98 | -p|--prompt) 99 | PROMPT="$2" 100 | shift 2 101 | ;; 102 | -r|--repetitions) 103 | REPETITIONS="$2" 104 | shift 2 105 | ;; 106 | -m|--max_tokens) 107 | MAX_TOKENS="$2" 108 | shift 2 109 | ;; 110 | -d|--device) 111 | DEVICE="$2" 112 | case "$DEVICE" in 113 | "cuda" | "metal" | "cpu") 114 | ;; 115 | *) 116 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 
117 | print_usage 118 | ;; 119 | esac 120 | if [ "$DEVICE" == "cuda" ]; then 121 | check_cuda 122 | else 123 | echo "Not supported for $DEVICE" 124 | exit 1 125 | fi 126 | shift 2 127 | ;; 128 | -n|--model_name) 129 | MODEL_NAME="$2" 130 | shift 2 131 | ;; 132 | -h|--help) 133 | print_usage 134 | ;; 135 | *) 136 | echo "Unknown option: $1" 137 | print_usage 138 | ;; 139 | esac 140 | done 141 | 142 | check_platform 143 | check_python 144 | 145 | # Set default values if not provided 146 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 147 | REPETITIONS="${REPETITIONS:-10}" 148 | MAX_TOKENS="${MAX_TOKENS:-512}" 149 | DEVICE="${DEVICE:-'cuda'}" 150 | MODEL_NAME="${MODEL_NAME:-"llama"}" 151 | 152 | setup "$DEVICE" "$MODEL_NAME" 153 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 154 | -------------------------------------------------------------------------------- /bench_vllm/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ################################################################################ 4 | # Script: setup.sh 5 | # Description: Automates the setup of a virtual environment and installs project 6 | # requirements. 7 | ################################################################################ 8 | 9 | set -euo pipefail 10 | 11 | CURRENT_DIR="$(pwd)" 12 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 13 | 14 | # Set default folder paths for AWQ weights 15 | LLAMA2_AWQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/llama-2-7b-chat-autoawq" 16 | MISTRAL_AWQ_WEIGHTS_FOLDER="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-autoawq" 17 | 18 | 19 | check_python() { 20 | if command -v python &> /dev/null; then 21 | PYTHON_CMD="python" 22 | elif command -v python3 &> /dev/null; then 23 | PYTHON_CMD="python3" 24 | else 25 | echo "Python is not installed." 26 | exit 1 27 | fi 28 | } 29 | 30 | check_python 31 | 32 | install_vllm_cuda() { 33 | CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \(.*\),.*/\1/p') 34 | 35 | if [ -z "$CUDA_VERSION" ]; then 36 | echo "CUDA is not installed or not found." 37 | exit 1 38 | fi 39 | 40 | CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1) 41 | CUDA_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f2) 42 | 43 | if [ "$CUDA_MAJOR" -ge 12 ] || { [ "$CUDA_MAJOR" -eq 12 ] && [ "$CUDA_MINOR" -ge 0 ]; }; then 44 | echo "Detected CUDA version >= 12.2" 45 | "$PYTHON_CMD" -m pip install vllm==0.4.0 transformers==4.39.2 46 | else 47 | echo "Detected CUDA version < 12.2" 48 | PY_VERSION=$(get_python_version) 49 | if [ -z "$PY_VERSION" ]; then 50 | echo "Python version not found." 
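# The prebuilt CUDA 11.8 wheel below is published per CPython ABI tag (cp38-cp311), so the interpreter
# version has to be resolved before a wheel filename can be chosen.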
51 | exit 1 52 | fi 53 | echo "Installing vllm for CUDA 11.8 with Python version: $PY_VERSION" 54 | # Download vllm for CUDA 11.8 and specified Python version 55 | "$PYTHON_CMD" -m pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-"$PY_VERSION"-"$PY_VERSION"-manylinux1_x86_64.whl 56 | "$PYTHON_CMD" -m pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118 57 | "$PYTHON_CMD" -m pip install huggingface-cli==0.1 transformers==4.39.2 58 | fi 59 | } 60 | 61 | get_python_version() { 62 | # Fetch Python version 63 | PY_VER=$("$PYTHON_CMD" -c 'import sys; print(".".join(map(str, sys.version_info[:2])))') 64 | 65 | case $PY_VER in 66 | 3.10) echo "cp310";; 67 | 3.8) echo "cp38";; 68 | 3.9) echo "cp39";; 69 | 3.11) echo "cp311";; 70 | *) echo "Unknown Python version"; exit 1;; 71 | esac 72 | } 73 | 74 | 75 | install_device_specific_vllm() { 76 | local DEVICE="$1" 77 | 78 | if [ "$#" -ne 1 ]; then 79 | echo "Usage: $0 " 80 | exit 1 81 | fi 82 | 83 | case "$DEVICE" in 84 | cuda) 85 | echo "Installing VLLM for CUDA." 86 | install_vllm_cuda 87 | ;; 88 | metal) 89 | echo "VLLM for metal is not supported yet." 90 | echo "For more information, checkout this issue: https://github.com/vllm-project/vllm/issues/1441" 91 | return 1 92 | ;; 93 | cpu) 94 | echo "VLLM for CPU is not supported yet." 95 | echo "For more information, checkout this issue: https://github.com/vllm-project/vllm/issues/176" 96 | ;; 97 | *) 98 | echo "Unsupported DEVICE: $DEVICE" 99 | return 1 100 | ;; 101 | esac 102 | } 103 | 104 | download_awq_weights() { 105 | local MODEL_NAME="$1" 106 | 107 | # Set download directory based on MODEL_NAME 108 | if [ "$MODEL_NAME" = "llama" ]; then 109 | DOWNLOAD_DIR="$LLAMA2_AWQ_WEIGHTS_FOLDER" 110 | MODEL_IDENTIFIER="TheBloke/Llama-2-7B-Chat-AWQ" 111 | elif [ "$MODEL_NAME" = "mistral" ]; then 112 | DOWNLOAD_DIR="$MISTRAL_AWQ_WEIGHTS_FOLDER" 113 | MODEL_IDENTIFIER="TheBloke/Mistral-7B-Instruct-v0.1-AWQ" 114 | else 115 | echo "Invalid MODEL_NAME. Supported values: 'llama', 'mistral'" 116 | exit 1 117 | fi 118 | 119 | # Check if weights folder exists 120 | echo "$DOWNLOAD_DIR" 121 | 122 | if [ ! -d "$DOWNLOAD_DIR" ]; then 123 | # Download weights using huggingface-cli 124 | echo "Downloading weights to $DOWNLOAD_DIR..." 125 | huggingface-cli download "$MODEL_IDENTIFIER" --local-dir "$DOWNLOAD_DIR" --exclude "*.git*" "*.md" "Notice" "LICENSE" 126 | else 127 | echo "Weights already downloaded" 128 | fi 129 | } 130 | 131 | 132 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 133 | VENV_DIR="$SCRIPT_DIR/venv" 134 | 135 | DEVICE="$1" 136 | MODEL_NAME="$2" 137 | 138 | 139 | # Build and activate the virtual environment. 140 | 141 | if [ ! -d "$VENV_DIR" ]; then 142 | "$PYTHON_CMD" -m venv "$VENV_DIR" 143 | echo "Virtual environment '$VENV_DIR' created." 
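# First-time setup: activate the fresh environment, install a vLLM build matched to the local CUDA
# toolkit, and only then fall through to download the AWQ weights used by the int4 runs.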
144 | # shellcheck disable=SC1091 145 | source "$VENV_DIR/bin/activate" 146 | "$PYTHON_CMD" -m pip install --upgrade pip > /dev/null 147 | install_device_specific_vllm "$DEVICE" 148 | else 149 | # shellcheck disable=SC1091 150 | source "$VENV_DIR/bin/activate" 151 | fi 152 | 153 | download_awq_weights "$MODEL_NAME" 154 | -------------------------------------------------------------------------------- /bench_llamacpp/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks LlamaCPP llama benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_platform() { 47 | local platform 48 | platform=$(uname -s) 49 | if [[ "$platform" == "Linux" ]]; then 50 | echo "Running on Linux." 51 | elif [[ "$platform" == "Darwin" ]]; then 52 | echo "Running on Mac OS." 53 | else 54 | echo "Unknown platform." 55 | exit 1 56 | fi 57 | } 58 | 59 | check_python() { 60 | if command -v python &> /dev/null; then 61 | PYTHON_CMD="python" 62 | elif command -v python3 &> /dev/null; then 63 | PYTHON_CMD="python3" 64 | else 65 | echo "Python is not installed." 66 | exit 1 67 | fi 68 | } 69 | 70 | setup() { 71 | local DEVICE="$1" 72 | local MODEL_NAME="$2" 73 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 
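# setup.sh is expected to produce a device-specific shared library under venv/ (libllama_<device>.so);
# run_benchmarks below points LLAMA_CPP_LIB at it for CUDA and Metal runs.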
74 | bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME" 75 | } 76 | 77 | run_benchmarks() { 78 | local PROMPT="$1" 79 | local REPETITIONS="$2" 80 | local MAX_TOKENS="$3" 81 | local DEVICE="$4" 82 | local MODEL_NAME="$5" 83 | 84 | if [ "$DEVICE" == "cuda" ] || [ "$DEVICE" == "metal" ]; then 85 | export LLAMA_CPP_LIB=$SCRIPT_DIR/venv/libllama_$DEVICE.so 86 | echo "LLAMA_CPP_LIB=$LLAMA_CPP_LIB" 87 | fi 88 | 89 | # shellcheck disable=SC1091 90 | source "$SCRIPT_DIR/venv/bin/activate" 91 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 92 | --prompt "$PROMPT" \ 93 | --repetitions "$REPETITIONS" \ 94 | --max_tokens "$MAX_TOKENS" \ 95 | --model_name "$MODEL_NAME" \ 96 | --device "$DEVICE" 97 | } 98 | 99 | while [ "$#" -gt 0 ]; do 100 | case "$1" in 101 | -p|--prompt) 102 | PROMPT="$2" 103 | shift 2 104 | ;; 105 | -r|--repetitions) 106 | REPETITIONS="$2" 107 | shift 2 108 | ;; 109 | -m|--max_tokens) 110 | MAX_TOKENS="$2" 111 | shift 2 112 | ;; 113 | -d|--device) 114 | DEVICE="$2" 115 | case "$DEVICE" in 116 | "cuda" | "metal" | "cpu") 117 | ;; 118 | *) 119 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 120 | print_usage 121 | ;; 122 | esac 123 | if [ "$DEVICE" == "cuda" ]; then 124 | check_cuda 125 | else 126 | echo "Not supported for $DEVICE" 127 | exit 1 128 | fi 129 | shift 2 130 | ;; 131 | -n|--model_name) 132 | MODEL_NAME="$2" 133 | shift 2 134 | ;; 135 | -h|--help) 136 | print_usage 137 | ;; 138 | *) 139 | echo "Unknown option: $1" 140 | print_usage 141 | ;; 142 | esac 143 | done 144 | 145 | check_platform 146 | check_python 147 | 148 | # Set default values if not provided 149 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 150 | REPETITIONS="${REPETITIONS:-10}" 151 | MAX_TOKENS="${MAX_TOKENS:-512}" 152 | DEVICE="${DEVICE:-cuda}" 153 | MODEL_NAME="${MODEL_NAME:-"llama"}" 154 | 155 | setup "$DEVICE" "$MODEL_NAME" 156 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 157 | -------------------------------------------------------------------------------- /bench_autogptq/bench.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | import torch 6 | from auto_gptq import AutoGPTQForCausalLM 7 | from transformers import AutoTokenizer 8 | 9 | sys.path.append(os.getcwd()) 10 | 11 | from common.base import BaseBenchmarkClass # noqa 12 | from common.utils import launch_cli, make_report # noqa 13 | 14 | _MESSAGE = """ 15 | GPTQ adopts a mixed int4/fp16 quantization scheme where weights are quantized as int4 while activations remain 16 | in float16. During inference, weights are dequantized on the fly and the actual compute is performed in float16.
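Roughly speaking, each group of weights is stored as 4-bit integers q plus a float16 scale s and zero-point z,
and the kernel reconstructs w ~= s * (q - z) on the fly just before the matmul, so memory footprint and bandwidth
drop while the arithmetic itself stays in float16.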
17 | """ 18 | 19 | 20 | class AutoGPTQBenchmark(BaseBenchmarkClass): 21 | def __init__( 22 | self, 23 | model_path: str, 24 | model_name: str, 25 | benchmark_name: str, 26 | precision: str, 27 | device: str, 28 | experiment_name: str, 29 | ) -> None: 30 | super().__init__( 31 | model_name=model_name, 32 | model_path=model_path, 33 | benchmark_name=benchmark_name, 34 | experiment_name=experiment_name, 35 | precision=precision, 36 | device=device, 37 | ) 38 | 39 | if model_name == "llama": 40 | self.tokenizer_folder = os.path.join( 41 | os.getcwd(), "models", "llama-2-7b-chat-hf" 42 | ) 43 | else: 44 | self.tokenizer_folder = os.path.join( 45 | os.getcwd(), "models", "mistral-7b-v0.1-instruct-hf" 46 | ) 47 | 48 | self.precision_map = {"float16": torch.float16, "float32": torch.float32} 49 | 50 | def load_model_and_tokenizer(self): 51 | device = "cuda:0" if self.device == "cuda" else self.device 52 | 53 | if self.model_name == "llama": 54 | if self.precision == "float16": 55 | use_marlin = True 56 | else: 57 | use_marlin = False 58 | else: 59 | use_marlin = False 60 | 61 | self.model = AutoGPTQForCausalLM.from_quantized( 62 | self.model_path, 63 | device=device, 64 | use_marlin=use_marlin, 65 | torch_dtype=self.precision_map[self.precision], 66 | ) 67 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 68 | return self 69 | 70 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 71 | if chat_mode: 72 | template = self.get_chat_template_with_instruction( 73 | prompt=prompt, for_benchmarks=for_benchmarks 74 | ) 75 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 76 | 77 | tokenized_input = self.tokenizer.encode(text=prompt) 78 | tensor = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device) 79 | 80 | return { 81 | "prompt": prompt, 82 | "input_tokens": tokenized_input, 83 | "tensor": tensor, 84 | "num_input_tokens": len(tokenized_input), 85 | } 86 | 87 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 88 | tensor = inputs["tensor"] 89 | num_input_tokens = inputs["num_input_tokens"] 90 | 91 | output = ( 92 | self.model.generate( 93 | input_ids=tensor, 94 | max_new_tokens=max_tokens, 95 | temperature=temperature, 96 | do_sample=True, 97 | ) 98 | .detach() 99 | .tolist()[0] 100 | ) 101 | 102 | output_tokens = ( 103 | output[num_input_tokens:] if len(output) > num_input_tokens else output 104 | ) 105 | return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)} 106 | 107 | def postprocess(self, output: dict) -> str: 108 | output_tokens = output["output_tokens"] 109 | return self.tokenizer.decode(output_tokens, skip_special_tokens=True) 110 | 111 | def on_exit(self): 112 | if self.device == "cuda:0": 113 | del self.model 114 | torch.cuda.synchronize() 115 | else: 116 | del self.model 117 | 118 | 119 | if __name__ == "__main__": 120 | parser = launch_cli(description="AutoGPTQ Benchmark.") 121 | args = parser.parse_args() 122 | 123 | model_folder = os.path.join(os.getcwd(), "models") 124 | model_name = ( 125 | f"{args.model_name}-2-7b-chat-autogptq" 126 | if args.model_name == "llama" 127 | else f"{args.model_name}-7b-v0.1-instruct-autogptq" 128 | ) 129 | logging.info(_MESSAGE) 130 | 131 | runner_dict = { 132 | "cuda": [ 133 | { 134 | "precision": "float16", 135 | "model_path": os.path.join(model_folder, model_name), 136 | }, 137 | { 138 | "precision": "float32", 139 | "model_path": os.path.join(model_folder, model_name), 140 | }, 141 | ] 142 | } 143 | 144 | if 
args.device == "cpu": 145 | logging.info("Skipping running model on int4 on CPU, not implemented for Half") 146 | pass 147 | else: 148 | make_report( 149 | args=args, 150 | benchmark_class=AutoGPTQBenchmark, 151 | runner_dict=runner_dict, 152 | benchmark_name="AutoGPTQ", 153 | is_bench_pytorch=False, 154 | ) 155 | -------------------------------------------------------------------------------- /bench_onnxruntime/bench.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import os 3 | import sys 4 | import time 5 | 6 | import torch 7 | from onnxruntime import InferenceSession 8 | from optimum.onnxruntime import ORTModelForCausalLM 9 | from transformers import AutoConfig, AutoTokenizer 10 | 11 | sys.path.append("/mnt") 12 | sys.path.append("/mnt/benchmarks/") 13 | 14 | from common.base import BaseBenchmarkClass # noqa 15 | from common.utils import launch_cli, make_report # noqa 16 | 17 | 18 | class ONNXOptimumBenchmark(BaseBenchmarkClass): 19 | def __init__( 20 | self, 21 | model_path: str, 22 | model_name: str, 23 | benchmark_name: str, 24 | precision: str, 25 | device: str, 26 | experiment_name: str, 27 | ) -> None: 28 | assert precision in ["float32", "float16"], ValueError( 29 | "Supported precision: 'float32' and 'float16'" 30 | ) 31 | assert device in ["cuda"], ValueError( 32 | "Current implement is only supported for device = 'cuda'" 33 | ) 34 | super().__init__( 35 | model_name=model_name, 36 | model_path=model_path, 37 | benchmark_name=benchmark_name, 38 | experiment_name=experiment_name, 39 | precision=precision, 40 | device=device, 41 | root_folder="/mnt/benchmarks", 42 | ) 43 | 44 | if model_name == "llama": 45 | self.tokenizer_folder = os.path.join( 46 | self.root_folder, "models", "llama-2-7b-chat-hf" 47 | ) 48 | else: 49 | self.tokenizer_folder = os.path.join( 50 | self.root_folder, "models", "mistral-7b-v0.1-instruct-hf" 51 | ) 52 | 53 | def load_model_and_tokenizer(self): 54 | start_time = time.perf_counter() 55 | onnx_path = os.path.join(self.model_path, "model.onnx") 56 | config = AutoConfig.from_pretrained(self.model_path) 57 | 58 | # load the session and the model 59 | self.session = InferenceSession(onnx_path, providers=["CUDAExecutionProvider"]) 60 | self.model = ORTModelForCausalLM( 61 | self.session, config, use_cache=False, use_io_binding=False 62 | ) 63 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 64 | delta = time.perf_counter() - start_time 65 | self.logger.info(f"Model Loading time took: {delta:.2f} seconds") 66 | return self 67 | 68 | def preprocess( 69 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 70 | ): 71 | if chat_mode: 72 | template = self.get_chat_template_with_instruction( 73 | prompt=prompt, for_benchmarks=for_benchmarks 74 | ) 75 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 76 | 77 | tokenized_input = self.tokenizer.encode(text=prompt) 78 | tensor = self.tokenizer(prompt, return_tensors="pt").to(self.device) 79 | return { 80 | "prompt": prompt, 81 | "input_tokens": tokenized_input, 82 | "tensor": tensor, 83 | "num_input_tokens": len(tokenized_input), 84 | } 85 | 86 | @torch.inference_mode(mode=True) 87 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 88 | tensor = inputs["tensor"] 89 | num_input_tokens = inputs["num_input_tokens"] 90 | 91 | generated = self.model.generate( 92 | **tensor, 93 | do_sample=True, 94 | temperature=temperature, 95 | max_new_tokens=max_tokens, 96 | top_p=0.1, 97 | 
pad_token_id=self.tokenizer.eos_token_id, 98 | eos_token_id=self.tokenizer.eos_token_id, 99 | ) 100 | 101 | output_tokens = generated[0].detach().tolist()[num_input_tokens:] 102 | return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)} 103 | 104 | def postprocess(self, output: dict) -> str: 105 | output_tokens = output["output_tokens"] 106 | output_text = self.tokenizer.decode(output_tokens, skip_special_tokens=True) 107 | return output_text 108 | 109 | def on_exit(self): 110 | if self.device in ["cuda", "cuda:0"]: 111 | del self.model 112 | del self.session 113 | torch.cuda.synchronize() 114 | gc.collect() 115 | else: 116 | del self.model 117 | del self.session 118 | 119 | 120 | if __name__ == "__main__": 121 | parser = launch_cli(description="ONNX HF-Optimum Benchmark.") 122 | args = parser.parse_args() 123 | 124 | model_folder = "/mnt/benchmarks/models" 125 | model_name = ( 126 | f"{args.model_name}-2-7b-chat-onnx" 127 | if args.model_name == "llama" 128 | else f"{args.model_name}-7b-v0.1-instruct-onnx" 129 | ) 130 | 131 | runner_dict = { 132 | "cuda": [ 133 | { 134 | "precision": "float32", 135 | "model_path": os.path.join(model_folder, model_name + "-float32"), 136 | }, 137 | { 138 | "precision": "float16", 139 | "model_path": os.path.join(model_folder, model_name + "-float16"), 140 | }, 141 | ] 142 | } 143 | 144 | make_report( 145 | args=args, 146 | benchmark_class=ONNXOptimumBenchmark, 147 | runner_dict=runner_dict, 148 | benchmark_name="ONNX-HF-Optimum", 149 | is_bench_pytorch=False, 150 | ) 151 | -------------------------------------------------------------------------------- /bench_tensorrtllm/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import tensorrt_llm 5 | import torch 6 | from tensorrt_llm.runtime import ModelRunnerCpp 7 | from transformers import AutoTokenizer 8 | 9 | sys.path.append("/mnt") 10 | sys.path.append("/mnt/benchmarks/") 11 | 12 | from common.base import BaseBenchmarkClass # noqa 13 | from common.utils import launch_cli, make_report # noqa 14 | 15 | 16 | class TensorRTLLMBenchmark(BaseBenchmarkClass): 17 | def __init__( 18 | self, 19 | model_path: str, 20 | model_name: str, 21 | benchmark_name: str, 22 | precision: str, 23 | device: str, 24 | experiment_name: str, 25 | ) -> None: 26 | super().__init__( 27 | model_name=model_name, 28 | model_path=model_path, 29 | benchmark_name=benchmark_name, 30 | experiment_name=experiment_name, 31 | precision=precision, 32 | device=device, 33 | root_folder="/mnt/benchmarks", 34 | ) 35 | self.runtime_rank = tensorrt_llm.mpi_rank() 36 | if model_name == "llama": 37 | self.tokenizer_folder = os.path.join( 38 | self.root_folder, "models", "llama-2-7b-chat-hf" 39 | ) 40 | else: 41 | self.tokenizer_folder = os.path.join( 42 | self.root_folder, "models", "mistral-7b-v0.1-instruct-hf" 43 | ) 44 | 45 | def load_model_and_tokenizer(self): 46 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_folder) 47 | if self.tokenizer.pad_token_id is None: 48 | self.tokenizer.pad_token_id = self.tokenizer.eos_token_id 49 | self.pad_id = self.tokenizer.pad_token_id 50 | self.end_id = self.tokenizer.eos_token_id 51 | 52 | # load the runner kawargs 53 | runner_kwargs = dict( 54 | engine_dir=self.model_path, 55 | rank=self.runtime_rank, 56 | max_batch_size=1, 57 | max_input_len=512, 58 | max_output_len=512, 59 | max_beam_width=1, 60 | max_attention_window_size=None, 61 | sink_token_length=None, 62 | ) 63 | self.model = 
ModelRunnerCpp.from_dir(**runner_kwargs) 64 | return self 65 | 66 | def preprocess( 67 | self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True 68 | ): 69 | if chat_mode: 70 | template = self.get_chat_template_with_instruction( 71 | prompt=prompt, for_benchmarks=for_benchmarks 72 | ) 73 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 74 | 75 | tokenized_input = self.tokenizer.encode(text=prompt) 76 | tensor = self.tokenizer.encode( 77 | prompt, return_tensors="pt", truncation=True 78 | ).squeeze(0) 79 | return { 80 | "prompt": prompt, 81 | "input_tokens": tokenized_input, 82 | "tensor": [tensor], 83 | "num_input_tokens": len(tokenized_input), 84 | } 85 | 86 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 87 | tensor = inputs["tensor"] 88 | num_input_tokens = inputs["num_input_tokens"] 89 | 90 | with torch.no_grad(): 91 | output = self.model.generate( 92 | tensor, 93 | max_new_tokens=max_tokens, 94 | temperature=temperature, 95 | pad_id=self.pad_id, 96 | end_id=self.end_id, 97 | return_dict=True, 98 | ) 99 | 100 | output_ids = output["output_ids"] 101 | output_tokens = output_ids[0][0].detach().cpu().tolist()[num_input_tokens:] 102 | 103 | return { 104 | "output_tokens": output_tokens, 105 | "num_output_tokens": len(output_tokens), 106 | } 107 | 108 | def postprocess(self, output: dict) -> str: 109 | output_tokens = output["output_tokens"] 110 | output_text = self.tokenizer.decode(output_tokens, skip_special_tokens=True) 111 | return output_text 112 | 113 | def on_exit(self): 114 | del self.model 115 | torch.cuda.synchronize() 116 | 117 | 118 | if __name__ == "__main__": 119 | parser = launch_cli(description="Nvidia TRT-LLM Benchmark.") 120 | args = parser.parse_args() 121 | 122 | model_folder = "/mnt/benchmarks/models" 123 | model_name = ( 124 | f"{args.model_name}-2-7b-chat-trt" 125 | if args.model_name == "llama" 126 | else f"{args.model_name}-7b-v0.1-instruct-trt" 127 | ) 128 | 129 | runner_dict = { 130 | "cuda": [ 131 | { 132 | "precision": "float32", 133 | "model_path": os.path.join(model_folder, model_name + "-float32"), 134 | }, 135 | { 136 | "precision": "float16", 137 | "model_path": os.path.join(model_folder, model_name + "-float16"), 138 | }, 139 | { 140 | "precision": "int8", 141 | "model_path": os.path.join(model_folder, model_name + "-int8"), 142 | }, 143 | { 144 | "precision": "int4", 145 | "model_path": os.path.join(model_folder, model_name + "-int4"), 146 | }, 147 | ] 148 | } 149 | 150 | make_report( 151 | args=args, 152 | benchmark_class=TensorRTLLMBenchmark, 153 | runner_dict=runner_dict, 154 | benchmark_name="Nvidia-TRT-LLM", 155 | is_bench_pytorch=False, 156 | ) 157 | -------------------------------------------------------------------------------- /bench_candle/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks candle llama benchmark. 
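#              Example (illustrative invocation, not part of the original script; the flags are the ones documented under OPTIONS below):
#                  ./bench.sh -d cuda -r 10 -m 100 -md ./models -lf candle_bench.log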
6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 100) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cpu') 13 | # -lf, --log_file Logging file name. 14 | # -md, --models_dir Models directory. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 21 | 22 | print_usage() { 23 | echo "Usage: $0 [OPTIONS]" 24 | echo "OPTIONS:" 25 | echo " -p, --prompt Prompt for benchmarks (default: 'Explain what is a transformer')" 26 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 27 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 100)" 28 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cpu')" 29 | echo " -lf, --log_file Logging file name." 30 | echo " -md, --models_dir Models directory." 31 | echo " -h, --help Show this help message" 32 | exit 1 33 | } 34 | 35 | check_cuda() { 36 | if command -v nvcc &> /dev/null 37 | then 38 | echo -e "\nUsing CUDA" 39 | nvcc --version 40 | else 41 | echo -e "\nCUDA is not available." 42 | exit 1 43 | fi 44 | } 45 | 46 | check_rust() { 47 | if which cargo &>/dev/null ; then 48 | echo -e "\nRust is installed. Using $(which cargo)" 49 | else 50 | echo -e "\nRust is not installed. Please install Rust before proceeding." 51 | exit 1 # Error exit code 52 | fi 53 | } 54 | 55 | check_platform() { 56 | local platform 57 | platform=$(uname -s) 58 | if [[ "$platform" == "Linux" ]]; then 59 | echo "Running on Linux." 60 | elif [[ "$platform" == "Darwin" ]]; then 61 | echo "Running on Mac OS." 62 | else 63 | echo "Unknown platform." 64 | exit 1 65 | fi 66 | } 67 | 68 | check_python() { 69 | if command -v python &> /dev/null 70 | then 71 | echo -e "\nUsing $(python --version)." 72 | else 73 | echo -e "\nPython is not installed." 74 | exit 1 75 | fi 76 | } 77 | 78 | setup() { 79 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 80 | bash "$SCRIPT_DIR/setup.sh" "$1" 81 | } 82 | 83 | run_benchmarks() { 84 | local PROMPT="$1" 85 | local REPETITIONS="$2" 86 | local MAX_TOKENS="$3" 87 | local DEVICE="$4" 88 | local LOG_FILENAME="$5" 89 | local MODELS_DIR="$6" 90 | 91 | if [ "$DEVICE" == "cpu" ] || [ "$DEVICE" == "cuda" ]; then 92 | CARGO_CANDLE_FEATURES=""; [ "$DEVICE" == "cuda" ] && CARGO_CANDLE_FEATURES="--features cuda" 93 | # shellcheck disable=SC2086 # word splitting of the optional feature flags is intentional 94 | cargo run --release $CARGO_CANDLE_FEATURES \ 95 | --manifest-path="$SCRIPT_DIR/llama2-candle/Cargo.toml" \ 96 | -- --local-weights "$MODELS_DIR/llama-2-7b-st/" \ 97 | --repetitions "$REPETITIONS" \ 98 | --prompt "$PROMPT" \ 99 | --sample-len "$MAX_TOKENS" \ 100 | --log-file "$LOG_FILENAME" 101 | fi 102 | } 103 | # Parse command-line arguments 104 | while [ "$#" -gt 0 ]; do 105 | case "$1" in 106 | -p|--prompt) 107 | PROMPT="$2" 108 | shift 2 109 | ;; 110 | -r|--repetitions) 111 | REPETITIONS="$2" 112 | shift 2 113 | ;; 114 | -m|--max_tokens) 115 | MAX_TOKENS="$2" 116 | shift 2 117 | ;; 118 | -d|--device) 119 | DEVICE="$2" 120 | case "$DEVICE" in 121 | "cuda" | "metal" | "cpu") 122 | ;; 123 | *) 124 | echo "Invalid value for --device. Please use 'cuda', 'metal' or 'cpu'."
125 | print_usage 126 | ;; 127 | esac 128 | if [ "$DEVICE" == "cuda" ]; then 129 | check_cuda 130 | fi 131 | if [ "$DEVICE" == "metal" ]; then 132 | echo "Metal not supported!" 133 | exit 0 134 | fi 135 | shift 2 136 | ;; 137 | -lf|--log_file) 138 | LOG_FILENAME="$2" 139 | shift 2 140 | ;; 141 | -md|--models_dir) 142 | MODELS_DIR="$2" 143 | shift 2 144 | ;; 145 | -h|--help) 146 | print_usage 147 | ;; 148 | *) 149 | echo "Unknown option: $1" 150 | print_usage 151 | ;; 152 | esac 153 | done 154 | 155 | # Set default values if not provided 156 | PROMPT="${PROMPT:-"Explain what is a transformer"}" 157 | REPETITIONS="${REPETITIONS:-10}" 158 | MAX_TOKENS="${MAX_TOKENS:-100}" 159 | DEVICE="${DEVICE:-cpu}" 160 | LOG_FILENAME="${LOG_FILENAME:-"benchmark_$(date +'%Y%m%d%H%M%S').log"}" 161 | MODELS_DIR="${MODELS_DIR:-"./models"}" 162 | 163 | check_platform 164 | check_rust 165 | check_python 166 | setup "$MODELS_DIR" 167 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$LOG_FILENAME" "$MODELS_DIR" 168 | -------------------------------------------------------------------------------- /bench_ctransformers/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs the CTransformers benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | 34 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 35 | 36 | 37 | check_cuda() { 38 | if command -v nvcc &> /dev/null 39 | then 40 | echo -e "\nUsing CUDA" 41 | nvcc --version 42 | else 43 | echo -e "\nCUDA is not available." 44 | exit 1 45 | fi 46 | } 47 | 48 | check_platform() { 49 | local platform 50 | platform=$(uname -s) 51 | if [[ "$platform" == "Linux" ]]; then 52 | echo "Running on Linux." 53 | elif [[ "$platform" == "Darwin" ]]; then 54 | echo "Running on Mac OS." 55 | else 56 | echo "Unknown platform."
57 | exit 1 58 | fi 59 | } 60 | 61 | check_python() { 62 | if command -v python &> /dev/null; then 63 | PYTHON_CMD="python" 64 | elif command -v python3 &> /dev/null; then 65 | PYTHON_CMD="python3" 66 | else 67 | echo "Python is not installed." 68 | exit 1 69 | fi 70 | } 71 | 72 | setup() { 73 | local MODEL_NAME="${1:-llama}" 74 | local DEVICE="$2" 75 | 76 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 77 | case "$DEVICE" in 78 | cuda) 79 | bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME" 80 | ;; 81 | metal) 82 | bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME" 83 | ;; 84 | cpu) 85 | bash "$SCRIPT_DIR/setup.sh" "$DEVICE" "$MODEL_NAME" 86 | ;; 87 | *) 88 | echo "Unsupported DEVICE: $DEVICE" 89 | exit 1 90 | ;; 91 | esac 92 | } 93 | 94 | run_benchmarks() { 95 | local PROMPT="$1" 96 | local REPETITIONS="$2" 97 | local MAX_TOKENS="$3" 98 | local DEVICE="$4" 99 | local MODEL_NAME="$5" 100 | 101 | # shellcheck disable=SC1091 102 | source "$SCRIPT_DIR/venv/bin/activate" 103 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 104 | --prompt "$PROMPT" \ 105 | --repetitions "$REPETITIONS" \ 106 | --max_tokens "$MAX_TOKENS" \ 107 | --model_name "$MODEL_NAME" \ 108 | --device "$DEVICE" 109 | } 110 | 111 | # Parse command-line arguments 112 | while [ "$#" -gt 0 ]; do 113 | case "$1" in 114 | -p|--prompt) 115 | PROMPT="$2" 116 | shift 2 117 | ;; 118 | -r|--repetitions) 119 | REPETITIONS="$2" 120 | shift 2 121 | ;; 122 | -m|--max_tokens) 123 | MAX_TOKENS="$2" 124 | shift 2 125 | ;; 126 | -d|--device) 127 | DEVICE="$2" 128 | case "$DEVICE" in 129 | "cuda" | "metal" | "cpu") 130 | ;; 131 | *) 132 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 133 | print_usage 134 | ;; 135 | esac 136 | if [ "$DEVICE" == "cuda" ]; then 137 | check_cuda 138 | else 139 | echo "Not supported for $DEVICE" 140 | exit 1 141 | fi 142 | shift 2 143 | ;; 144 | -n|--model_name) 145 | MODEL_NAME="$2" 146 | shift 2 147 | ;; 148 | -h|--help) 149 | print_usage 150 | ;; 151 | *) 152 | echo "Unknown option: $1" 153 | print_usage 154 | ;; 155 | esac 156 | done 157 | 158 | check_platform 159 | check_python 160 | setup "$MODEL_NAME" "$DEVICE" 161 | 162 | # Set default values if not provided 163 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 164 | REPETITIONS="${REPETITIONS:-10}" 165 | MAX_TOKENS="${MAX_TOKENS:-512}" 166 | DEVICE="${DEVICE:-'cuda'}" 167 | MODEL_NAME="${MODEL_NAME:-"llama"}" 168 | 169 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 170 | -------------------------------------------------------------------------------- /bench_pytorch/bench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | 7 | sys.path.append(os.getcwd()) 8 | 9 | from common.base import BaseBenchmarkClass # noqa 10 | from common.utils import launch_cli, make_report # noqa 11 | 12 | 13 | class PyTorchBenchmark(BaseBenchmarkClass): 14 | def __init__( 15 | self, 16 | model_path: str, 17 | model_name: str, 18 | benchmark_name: str, 19 | precision: str, 20 | device: str, 21 | experiment_name: str, 22 | ) -> None: 23 | super().__init__( 24 | model_name=model_name, 25 | model_path=model_path, 26 | benchmark_name=benchmark_name, 27 | experiment_name=experiment_name, 28 | precision=precision, 29 | device=device, 30 | ) 31 | 32 | @torch.inference_mode() 33 | def load_model_and_tokenizer(self): 34 | self.tokenizer = 
AutoTokenizer.from_pretrained(self.model_path) 35 | precision_dtype_mapping = {"float16": torch.float16, "float32": torch.float32} 36 | 37 | if self.precision in ["float16", "float32"]: 38 | device = "cuda:0" if self.device == "cuda" else self.device 39 | model_args = { 40 | "device_map": device, 41 | "torch_dtype": precision_dtype_mapping[self.precision], 42 | } 43 | self.model = AutoModelForCausalLM.from_pretrained( 44 | self.model_path, **model_args 45 | ) 46 | elif self.precision in ["int4", "int8"] and self.device in ["cuda:0", "cuda"]: 47 | from transformers import BitsAndBytesConfig 48 | 49 | bnb_config = ( 50 | BitsAndBytesConfig(load_in_8bit=True) 51 | if self.precision == "int8" 52 | else BitsAndBytesConfig( 53 | load_in_4bit=True, 54 | bnb_4bit_use_double_quant=True, 55 | bnb_4bit_quant_type="nf4", 56 | bnb_4bit_compute_dtype=torch.float16, 57 | ) 58 | ) 59 | 60 | if self.precision == "int8": 61 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 62 | 63 | self.model = AutoModelForCausalLM.from_pretrained( 64 | self.model_path, device_map=self.device, quantization_config=bnb_config 65 | ) 66 | else: 67 | raise ValueError( 68 | f"Invalid configuration: {self.device}, {self.precision}" 69 | "INT4/8 requires CUDA to execute." 70 | ) 71 | return self 72 | 73 | def preprocess(self, prompt: str, chat_mode: bool = True, for_benchmarks=True): 74 | if chat_mode: 75 | template = self.get_chat_template_with_instruction( 76 | prompt=prompt, for_benchmarks=for_benchmarks 77 | ) 78 | prompt = self.tokenizer.apply_chat_template(template, tokenize=False) 79 | 80 | tokenized_input = self.tokenizer.encode(text=prompt) 81 | tensor = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device) 82 | return { 83 | "prompt": prompt, 84 | "input_tokens": tokenized_input, 85 | "tensor": tensor, 86 | "num_input_tokens": len(tokenized_input), 87 | } 88 | 89 | def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict: 90 | tensor = inputs["tensor"] 91 | num_input_tokens = inputs["num_input_tokens"] 92 | 93 | output = ( 94 | self.model.generate( 95 | input_ids=tensor, 96 | max_new_tokens=max_tokens, 97 | temperature=temperature, 98 | do_sample=True, 99 | pad_token_id=self.tokenizer.eos_token_id, 100 | ) 101 | .detach() 102 | .tolist()[0] 103 | ) 104 | 105 | output_tokens = ( 106 | output[num_input_tokens:] if len(output) > num_input_tokens else output 107 | ) 108 | return {"output_tokens": output_tokens, "num_output_tokens": len(output_tokens)} 109 | 110 | def postprocess(self, output: dict) -> str: 111 | output_tokens = output["output_tokens"] 112 | return self.tokenizer.decode(output_tokens, skip_special_tokens=True) 113 | 114 | def on_exit(self): 115 | if self.device == "cuda:0": 116 | del self.model 117 | torch.cuda.synchronize() 118 | else: 119 | del self.model 120 | 121 | 122 | if __name__ == "__main__": 123 | parser = launch_cli( 124 | description="HuggingFace Transformers Benchmark (PyTorch backend)" 125 | ) 126 | args = parser.parse_args() 127 | model_folder = os.path.join(os.getcwd(), "models") 128 | model_name = ( 129 | f"{args.model_name}-2-7b-chat-hf" 130 | if args.model_name == "llama" 131 | else f"{args.model_name}-7b-v0.1-instruct-hf" 132 | ) 133 | model_path = os.path.join(model_folder, model_name) 134 | precisions_mapping = { 135 | "cpu": ("float32",), 136 | "cuda": ("float32", "float16", "int8", "int4"), 137 | "metal": ("float32", "float16"), 138 | } 139 | runner_dict = {} 140 | for device, precisions in precisions_mapping.items(): 141 | runner_dict[device] = [ 142 
| {"precision": precision, "model_path": model_path} 143 | for precision in precisions 144 | ] 145 | report = make_report( 146 | args=args, 147 | benchmark_class=PyTorchBenchmark, 148 | runner_dict=runner_dict, 149 | benchmark_name="HF-Transformers (PyTorch Backend)", 150 | is_bench_pytorch=True, 151 | ) 152 | -------------------------------------------------------------------------------- /bench_ctranslate/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################################################## 4 | # Script: bench.sh 5 | # Description: This script runs benchmarks CTranslate2 llama benchmark. 6 | # 7 | # Usage: ./bench.sh [OPTIONS] 8 | # OPTIONS: 9 | # -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture') 10 | # -r, --repetitions Number of repetitions for benchmarks (default: 10) 11 | # -m, --max_tokens Maximum number of tokens for benchmarks (default: 512) 12 | # -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda') 13 | # -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1) 14 | # -lf, --log_file Logging file name. 15 | # -h, --help Show this help message 16 | ######################################################################################################## 17 | 18 | set -euo pipefail 19 | 20 | print_usage() { 21 | echo "Usage: $0 [OPTIONS]" 22 | echo "OPTIONS:" 23 | echo " -p, --prompt Prompt for benchmarks (default: 'Write an essay about the transformer model architecture')" 24 | echo " -r, --repetitions Number of repetitions for benchmarks (default: 10)" 25 | echo " -m, --max_tokens Maximum number of tokens for benchmarks (default: 512)" 26 | echo " -d, --device Device for benchmarks (possible values: 'metal', 'cuda', and 'cpu', default: 'cuda')" 27 | echo " -n, --model_name The name of the model to benchmark (possible values: 'llama' for using Llama2, 'mistral' for using Mistral 7B v0.1)" 28 | echo " -lf, --log_file Logging file name." 29 | echo " -h, --help Show this help message" 30 | exit 1 31 | } 32 | 33 | CURRENT_DIR="$(pwd)" 34 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 35 | 36 | echo "$SCRIPT_DIR" 37 | 38 | check_platform() { 39 | local platform 40 | platform=$(uname -s) 41 | if [[ "$platform" == "Linux" ]]; then 42 | echo "Running on Linux." 43 | elif [[ "$platform" == "Darwin" ]]; then 44 | echo "Running on Mac OS." 45 | else 46 | echo "Unknown platform." 47 | exit 1 48 | fi 49 | } 50 | 51 | check_cuda() { 52 | if command -v nvcc &> /dev/null 53 | then 54 | echo -e "\nUsing CUDA" 55 | nvcc --version 56 | else 57 | echo -e "\nCUDA is not available." 58 | exit 1 59 | fi 60 | } 61 | 62 | check_python() { 63 | if command -v python &> /dev/null; then 64 | PYTHON_CMD="python" 65 | elif command -v python3 &> /dev/null; then 66 | PYTHON_CMD="python3" 67 | else 68 | echo "Python is not installed." 
69 | exit 1 70 | fi 71 | } 72 | 73 | setup() { 74 | local MODEL_NAME="${1:-llama}" 75 | 76 | if [[ "$MODEL_NAME" == "llama" ]]; then 77 | local model_pattern="$CURRENT_DIR/models/llama-2-7b-chat-ctranslate2-*" 78 | elif [[ "$MODEL_NAME" == "mistral" ]]; then 79 | local model_pattern="$CURRENT_DIR/models/mistral-7b-v0.1-instruct-ctranslate2-*" 80 | else 81 | echo "No such model is supported" 82 | exit 1 83 | fi 84 | 85 | matching_dirs=$(ls -d "$model_pattern" 2>/dev/null) 86 | 87 | if [ -n "$matching_dirs" ]; then 88 | echo "Already exists skipping setup" 89 | else 90 | echo -e "\nSetting up with $SCRIPT_DIR/setup.sh..." 91 | bash "$SCRIPT_DIR"/setup.sh "$MODEL_NAME" 92 | fi 93 | } 94 | 95 | run_benchmarks() { 96 | local PROMPT="$1" 97 | local REPETITIONS="$2" 98 | local MAX_TOKENS="$3" 99 | local DEVICE="$4" 100 | local MODEL_NAME="$5" 101 | 102 | # shellcheck disable=SC1091 103 | source "$SCRIPT_DIR/venv/bin/activate" 104 | "$PYTHON_CMD" "$SCRIPT_DIR"/bench.py \ 105 | --prompt "$PROMPT" \ 106 | --repetitions "$REPETITIONS" \ 107 | --max_tokens "$MAX_TOKENS" \ 108 | --model_name "$MODEL_NAME" \ 109 | --device "$DEVICE" 110 | } 111 | 112 | # Parse command-line arguments 113 | while [ "$#" -gt 0 ]; do 114 | case "$1" in 115 | -p|--prompt) 116 | PROMPT="$2" 117 | shift 2 118 | ;; 119 | -r|--repetitions) 120 | REPETITIONS="$2" 121 | shift 2 122 | ;; 123 | -m|--max_tokens) 124 | MAX_TOKENS="$2" 125 | shift 2 126 | ;; 127 | -d|--device) 128 | DEVICE="$2" 129 | case "$DEVICE" in 130 | "cuda" | "metal" | "cpu") 131 | ;; 132 | *) 133 | echo "Invalid value for --device. Please use 'cuda', 'cpu' or 'metal'." 134 | print_usage 135 | ;; 136 | esac 137 | if [ "$DEVICE" == "cuda" ]; then 138 | check_cuda 139 | else 140 | echo "Not supported for $DEVICE" 141 | exit 1 142 | fi 143 | shift 2 144 | ;; 145 | -n|--model_name) 146 | MODEL_NAME="$2" 147 | shift 2 148 | ;; 149 | -h|--help) 150 | print_usage 151 | ;; 152 | *) 153 | echo "Unknown option: $1" 154 | print_usage 155 | ;; 156 | esac 157 | done 158 | 159 | 160 | # Set default values if not provided 161 | PROMPT="${PROMPT:-"Write an essay about the transformer model architecture"}" 162 | REPETITIONS="${REPETITIONS:-10}" 163 | MAX_TOKENS="${MAX_TOKENS:-512}" 164 | DEVICE="${DEVICE:-'cuda'}" 165 | MODEL_NAME="${MODEL_NAME:-"llama"}" 166 | 167 | check_platform 168 | check_python 169 | setup "$MODEL_NAME" 170 | 171 | run_benchmarks "$PROMPT" "$REPETITIONS" "$MAX_TOKENS" "$DEVICE" "$MODEL_NAME" 172 | -------------------------------------------------------------------------------- /common/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import sys 6 | from collections import defaultdict 7 | from datetime import datetime 8 | 9 | import numpy as np 10 | 11 | 12 | def get_logger( 13 | benchmark_name: str, log_file_path: str = None, logging_level=logging.INFO 14 | ): 15 | logger = logging.getLogger(benchmark_name) 16 | if not logger.handlers: # Check if handlers have already been added 17 | logger.setLevel(logging_level) 18 | 19 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 20 | 21 | stream_handler = logging.StreamHandler(sys.stdout) 22 | stream_handler.setFormatter(formatter) 23 | logger.addHandler(stream_handler) 24 | 25 | if log_file_path is None: 26 | logfile_name = f"benchmark_{benchmark_name}_{datetime.now().strftime('%Y%m%d%H%M%S')}.log" 27 | log_file_path = os.path.join(os.getcwd(), "logs", logfile_name) 28 | 29 
| file_handler = logging.FileHandler(log_file_path) 30 | file_handler.setFormatter(formatter) 31 | logger.addHandler(file_handler) 32 | 33 | return logger 34 | 35 | 36 | def launch_cli(description: str): 37 | parser = argparse.ArgumentParser(description=description) 38 | parser.add_argument( 39 | "--prompt", 40 | type=str, 41 | help="The prompt for the model.", 42 | ) 43 | parser.add_argument("--max_tokens", type=int, help="The maximum number of tokens.") 44 | 45 | parser.add_argument( 46 | "--repetitions", 47 | type=int, 48 | help="The number of repetitions for the benchmark.", 49 | ) 50 | parser.add_argument( 51 | "--device", 52 | help="Device to use for the benchmark.", 53 | ) 54 | parser.add_argument( 55 | "--model_name", 56 | type=str, 57 | help="The name of the model to benchmark ('llama' or 'mistral').", 58 | ) 59 | 60 | parser.add_argument( 61 | "--temperature", 62 | type=float, 63 | help="Temperature to use.", 64 | ) 65 | 66 | return parser 67 | 68 | 69 | def make_report( 70 | args, benchmark_class, runner_dict, benchmark_name, is_bench_pytorch: bool = False 71 | ): 72 | experiment_name = f"{benchmark_name}-{str(datetime.now())}" 73 | report = defaultdict(lambda: defaultdict(float)) 74 | all_answers = {} 75 | 76 | for instance in runner_dict[args.device]: 77 | model_path, precision = instance["model_path"], instance["precision"] 78 | benchmark = benchmark_class( 79 | model_path=model_path, 80 | model_name=args.model_name, 81 | benchmark_name=benchmark_name, 82 | precision=precision, 83 | device=args.device, 84 | experiment_name=experiment_name, 85 | ).load_model_and_tokenizer() 86 | 87 | logger = benchmark.logger 88 | 89 | # First we do benchmarking 90 | benchmark.benchmark( 91 | prompt=args.prompt, 92 | max_tokens=args.max_tokens, 93 | repetitions=args.repetitions, 94 | temperature=args.temperature, 95 | ) 96 | 97 | # Make report for benchmarks 98 | # Memory usage seems to stay the same, so we take the max of it 99 | 100 | report[f"{args.model_name}-{benchmark_name} (token/sec)"][precision] = { 101 | "mean": np.mean(benchmark.tps_results), 102 | "std": np.std(benchmark.tps_results), 103 | } 104 | 105 | report[f"{args.model_name}-{benchmark_name} (memory usage)"][precision] = { 106 | "usage": max(benchmark.memory_usage_results) 107 | } 108 | 109 | # Second we get the answers 110 | benchmark.get_answers() 111 | all_answers[precision] = benchmark.answers 112 | 113 | # Make the final report 114 | 115 | for framework, quantizations in report.items(): 116 | for quantization, stats in quantizations.items(): 117 | if framework == f"{args.model_name}-{benchmark_name} (memory usage)": 118 | logger.info(f"{framework}, {quantization}: {stats['usage']} MB") 119 | else: 120 | logger.info( 121 | f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}" 122 | ) 123 | # Finally write the quality checks results 124 | logger.info("Writing the model completions for empirical tests") 125 | with open(benchmark.answers_json_path, "w") as json_file: 126 | json.dump(all_answers, json_file) 127 | 128 | logger.info("Benchmarking Finished") 129 | markdown_content = make_markdown( 130 | input_json_path=benchmark.answers_json_path, is_bench_pytorch=is_bench_pytorch 131 | ) 132 | 133 | with open(os.path.join(benchmark.log_folder, "quality.md"), "w") as readme_file: 134 | readme_file.write("\n".join(markdown_content)) 135 | 136 | print("quality.md has been created with the table.") 137 | 138 | 139 | def make_markdown(input_json_path: str, is_bench_pytorch: bool = False): 140 | with open(input_json_path, "r") as file: 141
data = json.load(file) 142 | 143 | precisions = list(data.keys()) 144 | markdown_content = [] 145 | 146 | # Helper function to create a markdown table row 147 | def create_row(items): 148 | return "| " + " | ".join(items) + " |" 149 | 150 | # Build headers based on the mode 151 | if is_bench_pytorch: 152 | headers = ["Question"] + precisions 153 | else: 154 | headers = ["Question"] + precisions + ["Ground Truth"] 155 | 156 | markdown_content.append(create_row(headers)) 157 | markdown_content.append(create_row(["---"] * len(headers))) 158 | 159 | # Build the Markdown 160 | for idx, question in enumerate(data[precisions[0]]): 161 | question_text = question.get( 162 | "prompt" if is_bench_pytorch else "question", "" 163 | ).replace("\n", " ") 164 | 165 | answers = [ 166 | data[precision][idx]["actual"].replace("\n", "
") 167 | for precision in precisions 168 | ] 169 | row_items = [question_text] + answers 170 | 171 | if not is_bench_pytorch: 172 | ground_truths = [ 173 | data[precision][idx]["expected"].replace("\n", "
") 174 | for precision in precisions 175 | ] 176 | row_items += ground_truths 177 | markdown_content.append(create_row(row_items)) 178 | 179 | return markdown_content 180 | --------------------------------------------------------------------------------