├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── assets ├── docs │ ├── logo.png │ └── sample_results │ │ ├── Llama-2-7b-hf_L40.json │ │ ├── codellama-13b-oasst-sft-v10_H100.json │ │ └── mpt-7b_L40.json └── setup │ └── runpod.sh ├── configs ├── hf_test.json └── llmvm_test.json ├── metrics.py ├── model.py ├── run.py ├── src ├── hf.py ├── hw.py ├── llmvm.py ├── logger.py ├── pointer.py └── util.py └── tools ├── columns.py └── graph.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # project specific 163 | .DS_Store 164 | LLM-VM/ 165 | finetuned_models/ 166 | assets/sandbox/ 167 | *_metrics.json 168 | *_model.json 169 | report_*.json 170 | results/ 171 | events.log 172 | *run_*.json 173 | reports/ 174 | tools/*.csv 175 | 176 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.analysis.extraPaths": [ 3 | "./src/LLM-VM/src" 4 | ] 5 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Mehmet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM Speed Benchmark (LLMSB) 2 | 3 |

4 | 5 |

6 | 7 | 🚧 LLM Speed Benchmark (LLMSB) is currently in beta (v0). Please do not use this in production, or use it at your own risk. We're still ironing out some kinks and improving functionality. If you encounter any bugs or have suggestions, kindly report them under [ISSUES](https://github.com/anarchy-ai/benchllm/issues). Your feedback is invaluable! 8 | 9 | ## About 10 | 11 | LLM Speed Benchmark (LLMSB) is a benchmarking tool for assessing LLM models' performance across different hardware platforms. Its ultimate goal is to compile a comprehensive dataset detailing LLM models' performance on various systems, enabling users to more effectively choose the right LLM model(s) for their projects. 12 | 13 | ## Limitations 14 | 15 | LLMSB is at v0, so it has limitations: 16 | - It is only designed to run on Debian-based operating systems, i.e. it is not designed to run on Windows. This is because LLMSB uses neofetch and nvidia-smi to gather metrics under the hood, and the filepath logic is based on Unix operating systems. 17 | - Due to how metrics are recorded, the metrics collector can take up to 1 second to perform a single collection. This means that, at the fastest, we can collect hardware metrics every 1 second. 18 | - LLMSB only uses HuggingFace to load and run models. This works for now, but the goal is to have LLMSB support multiple frameworks, not just HuggingFace. 19 | - Currently, all models are run through the logic in the run_llm() function, located in src/hf.py, where the AutoTokenizer() and AutoModelForCausalLM() functions are used to load and run a model. This works, but it limits how we can configure/optimize specific models. Knowing this, the goal is to create separate classes for each popular model and utilize HuggingFace's model-specific classes, like LlamaTokenizer & LlamaForCausalLM, instead. 20 | - LLMSB only gathers general, high-level metrics. In the future, we would like to gather lower-level metrics. We think this can partly be done using PyTorch's [profiler wrapper](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html). 21 | 22 | ## Sample Outputs 23 | 24 | ### November 22, 2023 25 | 26 | LLMSB was run/tested on an L40 and an H100 GPU through [RunPod](https://www.runpod.io/). In those benchmarks the models [llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf), [codellama-13b-oasst-sft-v10](https://huggingface.co/OpenAssistant/codellama-13b-oasst-sft-v10), & [mpt-7b](https://huggingface.co/mosaicml/mpt-7b) were tested. 27 | 28 | Check out the results [HERE](https://github.com/anarchy-ai/llm-speed-benchmark/tree/main/assets/docs/sample_results). If any errors/issues are noticed, please report them under ISSUES. 29 | 30 | ## Setup 31 | 32 | 1. Create and activate a Python environment: 33 | ``` 34 | python3 -m venv env 35 | source env/bin/activate 36 | ``` 37 | 38 | 2. Install package dependencies (using APT): 39 | ``` 40 | apt -y update 41 | apt install -y vim 42 | apt install -y neofetch 43 | ``` 44 | 45 | 3. Install Python dependencies: 46 | ``` 47 | pip3 install transformers 48 | pip3 install psutil 49 | pip3 install gputil 50 | pip3 install tabulate 51 | pip3 install sentencepiece 52 | pip3 install protobuf 53 | ``` 54 | 55 | 4. Install PyTorch (to determine how to install PyTorch for your system, check out their tool at https://pytorch.org/): 56 | ``` 57 | # install the pytorch stable build, for linux, using CUDA 12.1: 58 | pip3 install torch torchvision torchaudio 59 | ``` 60 | 61 | 5. 
Install [LLM-VM](https://github.com/anarchy-ai/LLM-VM/tree/main): 62 | ``` 63 | pip install llm-vm 64 | ``` 65 | 66 | 6. (optional) If you are using models like LLaMA, you will need a HuggingFace access token. Set up your access token [HERE](https://huggingface.co/settings/tokens), then save the token on your machine by running the following command: 67 | ``` 68 | huggingface-cli login 69 | ``` 70 | 71 | ## How To Run 72 | 73 | 1. Complete the steps listed in the __Setup__ section. 74 | 75 | 2. To configure your test, you need to create a JSON file with the following parameters (here is an example): 76 | - NOTE: not every framework supports the same parameters 77 | ``` 78 | { 79 | "model": "bigscience/bloom-560m", # the model's path/repo on HuggingFace (https://huggingface.co/models) 80 | "prompt": "Hello World!", # the prompt you want to input into the LLM model 81 | "device": "cuda:0", # the device you want to run the LLM model on (GPU/CPU) 82 | "max_length": 50, # the maximum length of the generated tokens 83 | "temperature": 0.9, # temperature value for the LLM model 84 | "top_k": 50, # top-k value for the LLM model 85 | "top_p": 0.9, # top-p value for the LLM model 86 | "num_return_sequences": 1, # the number of independently run instances of the model 87 | "time_delay": 0, # the time delay (seconds) the metrics collector will wait per iteration 88 | "model_start_pause": 1, # the time (seconds) the test will wait BEFORE running the LLM model 89 | "model_end_pause": 1, # the time (seconds) the test will wait AFTER the LLM model is done running 90 | "framework": "llm-vm" # the name of the framework/library you want to use to run the model 91 | } 92 | ``` 93 | 94 | 3. Using the path to the config file you created in the previous step, run the following to start the benchmark (pick one option): 95 | ``` 96 | # run one benchmark 97 | python3 run.py --config ./configs/llmvm_test.json 98 | 99 | # run more than one benchmark (in this case 3) 100 | python3 run.py --config ./configs/llmvm_test.json --loops 3 101 | ``` 102 | 103 | 4. After the benchmark is done running, check out the final results in a file that should look something like this: 104 | ``` 105 | report_2023-11-25_05:55:04.207515_utc_1ffc4fa7-3aa9-4878-b874-1ff445e1ff8a.json 106 | ``` 107 | 108 | ## Setting Up RunPod: 109 | 110 | 1. Set up RunPod, set up your SSH cert/key, and get a pod running. You can access your pod(s) here: https://www.runpod.io/console/pods 111 | 112 | 2. Click the "Connect" button to get the SSH connection info. This info should look something like this: 113 | ``` 114 | ssh root@12.345.678.90 -p 12345 -i ~/.ssh/id_example 115 | ``` 116 | - This command will be formatted like this: 117 | ``` 118 | ssh <user>@<ip-address> -p <port> -i <path-to-ssh-key> 119 | ``` 120 | 121 | 3. Using the command in step #2, you should be able to SSH into the pod and use the GPU you selected for that RunPod pod. 122 | 123 | 4. If you want to copy a file from the pod to your local machine, you would run a command in this format (this is referring to the variables shown in step #2): 124 | ``` 125 | scp -P <port> -i <path-to-ssh-key> <user>@<ip-address>:<file-path-on-pod> <local-destination-path> 126 | ``` 127 | - Here is an example of such a command: 128 | ``` 129 | scp -P 12345 -i ~/.ssh/id_example root@12.345.678.90:/root/test.txt /home/user1/Downloads/ 130 | ``` 131 | 132 | 5. After you are done with the pod, shut it down or pause it. But be warned: if you pause it, you will still get charged, just much less. 
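Putting the Setup, How To Run, and RunPod sections together, a full session on a fresh pod looks roughly like the sketch below. This is only an outline, not an official script: the repository URL is assumed from the sample-results link above, and the port, key path, and pod IP are placeholders you must replace with the connection info from your own pod.
```
# on the pod: grab the code and set up the environment
git clone https://github.com/anarchy-ai/llm-speed-benchmark.git
cd llm-speed-benchmark
python3 -m venv env
source env/bin/activate
bash assets/setup/runpod.sh   # installs the APT + pip dependencies, then prompts for your HuggingFace token

# run the benchmark a few times with one of the sample configs
python3 run.py --config ./configs/llmvm_test.json --loops 3

# back on your local machine: copy the generated reports off the pod
scp -P <port> -i <path-to-ssh-key> root@<ip-address>:/root/llm-speed-benchmark/reports/*.json ~/Downloads/
```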
133 | 134 | ## Great Sources: 135 | 136 | - Great datasets of prompts (if you can't come up with any): 137 | - https://github.com/f/awesome-chatgpt-prompts/tree/main 138 | - https://huggingface.co/datasets/bigscience/P3 139 | - https://www.kaggle.com/datasets/ratthachat/writing-prompts 140 | 141 | - Learn more about LLM parameters: https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig 142 | 143 | - Great benchmark to benchmark cloud-based LLM models: https://github.com/ray-project/llmperf 144 | 145 | - Cool LLM intelligence leadboards: 146 | - https://fasteval.github.io/FastEval/ 147 | - https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard 148 | -------------------------------------------------------------------------------- /assets/docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anarchy-ai/llm-speed-benchmark/2b2917c390074fba879bf5090139960d51999561/assets/docs/logo.png -------------------------------------------------------------------------------- /assets/setup/runpod.sh: -------------------------------------------------------------------------------- 1 | # Website: https://www.runpod.io/console/pods 2 | # Service: RunPod 3 | 4 | apt -y update 5 | apt install -y vim 6 | apt install -y neofetch 7 | 8 | pip3 install transformers 9 | pip3 install psutil 10 | pip3 install gputil 11 | pip3 install tabulate 12 | pip3 install torch torchvision torchaudio 13 | pip3 install matplotlib 14 | pip3 install sentencepiece 15 | pip3 install protobuf 16 | 17 | 18 | pip install llm-vm 19 | 20 | huggingface-cli login 21 | 22 | -------------------------------------------------------------------------------- /configs/hf_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "facebook/opt-iml-max-1.3b", 3 | "prompt": "Hello World!", 4 | "device": "cuda:0", 5 | 6 | "max_length": 1000, 7 | "temperature": 0.9, 8 | "top_k": 50, 9 | "top_p": 0.9, 10 | "num_return_sequences": 1, 11 | 12 | "time_delay": 0, 13 | "model_start_pause": 1, 14 | "model_end_pause": 1, 15 | "dtype": "bfloat16", 16 | "framework": "huggingface" 17 | } -------------------------------------------------------------------------------- /configs/llmvm_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "neo", 3 | "prompt": "QUESTION: what is your goal in life ANSWER: ", 4 | 5 | "max_length": null, 6 | "temperature": 0.9, 7 | "top_k": null, 8 | "top_p": null, 9 | "num_return_sequences": null, 10 | 11 | "time_delay": 0, 12 | "model_start_pause": 1, 13 | "model_end_pause": 1, 14 | "dtype": null, 15 | "framework": "llm-vm" 16 | } 17 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import sys 4 | import json 5 | import signal 6 | import argparse 7 | import uuid 8 | 9 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")) 10 | import hw 11 | import logger 12 | 13 | # config arguments 14 | parser = argparse.ArgumentParser(description='run hardware performance/metrics collector') 15 | parser.add_argument('--time-delay', type=int, default=1, help='the time dely, in seconds, for each collection interation') 16 | parser.add_argument('--uuid', type=str, default=str(uuid.uuid4()), help='the UUID for the collection') 17 | 18 
| # global variable to determine when the collection loop should stop 19 | running = True 20 | 21 | # signal handler 22 | def signal_handler(signum, frame): 23 | global running 24 | running = False 25 | 26 | if __name__ == "__main__": 27 | args = parser.parse_args() 28 | 29 | signal.signal(signal.SIGTERM, signal_handler) 30 | signal.signal(signal.SIGINT, signal_handler) 31 | 32 | logger.info(f"{args.uuid} - metrics collection has started...") 33 | 34 | metrics = {} 35 | counter = 0 36 | while running: 37 | timestamp = str(time.time()) 38 | metrics[timestamp] = hw.get_all() 39 | logger.info(f"{args.uuid} - metrics collector - Collected metrics for the {counter+1} time, now waiting for {args.time_delay} sec") 40 | counter += 1 41 | time.sleep(args.time_delay) 42 | 43 | logger.info(f"{args.uuid} - metrics collecton has concluded!") 44 | 45 | filepath = f"{args.uuid}_metrics.json" 46 | with open(str(filepath), "w") as file: 47 | json.dump(metrics, file, indent=4) 48 | 49 | logger.info(f"{args.uuid} - metrics collector - Saved {len(metrics.keys())} data points to file {filepath}") 50 | 51 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import json 4 | import os 5 | import sys 6 | import copy 7 | import gc 8 | import signal 9 | import argparse 10 | import uuid 11 | 12 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")) 13 | import logger 14 | import pointer 15 | 16 | def float_or_none(value): 17 | if value.lower() == 'none': 18 | return None 19 | try: 20 | return float(value) 21 | except ValueError: 22 | raise argparse.ArgumentTypeError(f"{value} must be a floating point number or 'None'") 23 | 24 | def int_or_none(value): 25 | if value.lower() == 'none': 26 | return None 27 | try: 28 | return int(value) 29 | except ValueError: 30 | raise argparse.ArgumentTypeError(f"{value} must be an int number or 'None'") 31 | 32 | def str_or_none(value): 33 | if value.lower() == 'none': 34 | return None 35 | try: 36 | return str(value) 37 | except ValueError: 38 | raise argparse.ArgumentTypeError(f"{value} must be a string or 'None'") 39 | 40 | parser = argparse.ArgumentParser(description='run llm model hosted on HuggingFace') 41 | 42 | """ 43 | November 21, 2023 44 | The default values and help values for most of these parameters were taken directly from huggingface documentation: 45 | https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig 46 | """ 47 | parser.add_argument('--max_length', type=int_or_none, default=20, help='The maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. 
Its effect is overridden by max_new_tokens, if also set.') 48 | parser.add_argument('--temperature', type=float_or_none, default=1.0, help='The value used to modulate the next token probabilities.') 49 | parser.add_argument('--top_k', type=int_or_none, default=50, help='The number of highest probability vocabulary tokens to keep for top-k-filtering.') 50 | parser.add_argument('--top_p', type=float_or_none, default=1.0, help='If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.') 51 | parser.add_argument('--num_return_sequences', type=int_or_none, default=1, help='The number of independently computed returned sequences for each element in the batch.') 52 | 53 | parser.add_argument('--uuid', type=str_or_none, default=str(uuid.uuid4()), help='The UUID for the collection') 54 | parser.add_argument('--prompt', type=str_or_none, default="Hello World", help='Text prompt for the LLM model to respond too') 55 | parser.add_argument('--model', type=str_or_none, default="", help='Huggingface repo/path to LLM model') 56 | parser.add_argument('--device', type=str_or_none, default="", help='Device to run the model on, this can be "cpu" or "cuda:N"') 57 | parser.add_argument('--dtype', type=str_or_none, default="bfloat16", help="A tensor's data type, this will effect the overall accuracy and hardware performance for a model") 58 | parser.add_argument('--framework', type=str_or_none, default=None, help="Perfered framework to run LLM model on (huggingface, LLM-VM, etc)") 59 | 60 | # signal handler 61 | def signal_handler(signum, frame): 62 | sys.exit(1) 63 | 64 | if __name__ == "__main__": 65 | signal.signal(signal.SIGTERM, signal_handler) 66 | signal.signal(signal.SIGINT, signal_handler) 67 | 68 | args = parser.parse_args() 69 | 70 | if args.model == "": 71 | logger.error(f"{args.uuid} - model not provided, please provide a model from huggingface: https://huggingface.co/models") 72 | sys.exit(1) 73 | 74 | logger.info(f"{args.uuid} - running model with following parameters {str(args)}") 75 | 76 | start_time = time.time() 77 | 78 | logger.info(f"{args.uuid} - model {args.model} started at epoch time {start_time} seconds") 79 | 80 | # (11-30-2023) Change this part, for finetuning or custom model running 81 | ############################################################################################################## 82 | 83 | try: 84 | output = pointer.execute_llm(args.framework, args.model, args.prompt, args.device, args.dtype, { 85 | "max_length": args.max_length, 86 | "temperature": args.temperature, 87 | "top_k": args.top_k, 88 | "top_p": args.top_p, 89 | "num_return_sequences": args.num_return_sequences 90 | }) 91 | except Exception as err: 92 | logger.critical(f"{args.uuid} - existing... 
due to model {args.model} failing to run with error: {err}") 93 | sys.exit(1) 94 | 95 | ############################################################################################################## 96 | 97 | end_time = time.time() 98 | 99 | logger.info(f"{args.uuid} - model {args.model} completed at epoch time {end_time} seconds") 100 | 101 | output["run_period"] = { 102 | "started": start_time, 103 | "ended": end_time 104 | } 105 | 106 | # delete cache and variables to free up resources for better metrics collecting 107 | final_result = copy.deepcopy(output) 108 | logger.info(f"{args.uuid} - calling Python's garbage collector and emptying the CUDA cache if a GPU was used") 109 | gc.collect() 110 | del output 111 | if "cuda" in str(args.device): 112 | torch.cuda.empty_cache() 113 | 114 | filepath = f"{args.uuid}_model.json" 115 | with open(str(filepath), "w") as file: 116 | json.dump(final_result, file, indent=4) 117 | 118 | logger.info(f"{args.uuid} - model running - saved output for model run to file {filepath}") 119 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | import subprocess 3 | import argparse 4 | import time 5 | import os 6 | import signal 7 | import uuid 8 | import sys 9 | import json 10 | 11 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")) 12 | import logger 13 | import util 14 | import hf 15 | import hw 16 | 17 | # config arguments 18 | parser = argparse.ArgumentParser(description='Run performance benchmark for an LLM model') 19 | parser.add_argument('--name', type=str, default=None, help='name of this performance benchmark run') 20 | parser.add_argument('--config', type=str, default=None, help='path to config file that will be used for the performance benchmark') 21 | parser.add_argument('--loops', type=int, default=1, help='number of times the performance benchmark will be run (default=1)') 22 | 23 | def main(name=None, config_path=None): 24 | ID = str(uuid.uuid4()) 25 | 26 | logger.info(f"This performance run's ID is {ID} with name={name}", True) 27 | 28 | # Get the current script path 29 | current_script_path = os.path.dirname(os.path.abspath(__file__)) 30 | 31 | if config_path == None: 32 | raise Exception(f"please provide a path to a test config file (json)") 33 | elif os.path.isfile(str(config_path)) == False: 34 | logger.error(f"[{ID}] Config path {config_path} does not exist! 
Existing...", True) 35 | sys.exit(1) 36 | 37 | config = util.read_json(config_path) 38 | logger.info(f"[{ID}] Loaded config file {config_path} for this benchmark run, with the following configuration: {config}", True) 39 | 40 | # NOTE: make sure a python environment named "env" is created in the same repo as this script 41 | env_path = os.path.join(current_script_path, "env/bin/python3") 42 | if os.path.isfile(env_path) == False: 43 | logger.critical(f"[{ID}] python environment {env_path} does not exist, please create it!", True) 44 | sys.exit(1) 45 | 46 | # TODO: (11-24-2023) this is commented out because not every LLM framework uses HuggingFace or the same model name(s) 47 | # TODO: (11-24-2023) a solution for this needs to be found or this needs to get ripped out entirly 48 | # logger.info(f"[{ID}] checking if model exists and is downloaded locally...", True) 49 | # local_hf_exists = hf.get_hf_model(str(config["model"])) 50 | # if local_hf_exists == False: 51 | # logger.critical(f"[{ID}] failed to download model {config['model']}, please look into this, existing...", True) 52 | # sys.exit(1) 53 | 54 | ################################################################################################ 55 | 56 | logger.info(f"[{ID}] Starting metrics collector...", True) 57 | try: 58 | collecting_process = subprocess.Popen([env_path, os.path.join(current_script_path, "metrics.py"), 59 | '--time-delay', str(config["time_delay"]), 60 | "--uuid", str(ID) 61 | ]) 62 | logger.info(f"[{ID}] the metrics collector is running with a PID of {collecting_process.pid}", True) 63 | except Exception as err: 64 | logger.error(f"[{ID}] failed to run metric collector due to error: {err}, so existing...", True) 65 | sys.exit(1) 66 | 67 | ################################################################################################ 68 | 69 | logger.info(f"[{ID}] Initiated {config['model_start_pause']} second pre model start to gather hardware metrics BEFORE the model is activated", True) 70 | time.sleep(config["model_start_pause"]) 71 | 72 | ################################################################################################ 73 | 74 | logger.info(f"[{ID}] Activating model {config['model']} with following parameters: {str(config)}", True) 75 | try: 76 | model_running_process = subprocess.Popen([env_path, os.path.join(current_script_path, "model.py"), 77 | "--framework", str(config.get("framework")), 78 | "--max_length", str(config.get("max_length")), 79 | "--temperature", str(config.get("temperature")), 80 | "--top_k", str(config.get("top_k")), 81 | "--top_p", str(config.get("top_p")), 82 | "--num_return_sequences", str(config.get("num_return_sequences")), 83 | "--uuid", str(ID), 84 | "--prompt", str(config.get("prompt")), 85 | "--model", str(config.get("model")), 86 | "--device", str(config.get("device")), 87 | "--dtype", str(config.get("dtype")) 88 | ]) 89 | logger.info(f"[{ID}] model {config['model']} is running with a PID of {model_running_process.pid}", True) 90 | except Exception as err: 91 | logger.error(f"[{ID}] failed to run model {config['model']} due to error: {err}", True) 92 | logger.error(f"[{ID}] attempting to kill metrics collector due to model failing to run", True) 93 | collecting_process.send_signal(signal.SIGTERM) 94 | collecting_process.wait() 95 | sys.exit(1) 96 | 97 | ################################################################################################ 98 | 99 | logger.info(f"[{ID}] waiting for model {config['model']} to finish running...", True) 100 | 
model_running_process.wait() 101 | logger.info(f"[{ID}] model {config['model']} finished running! no longer waiting!", True) 102 | 103 | logger.info(f"[{ID}] Initiated {config['model_end_pause']} second post-model-end pause to gather hardware metrics AFTER the model has completed its run", True) 104 | time.sleep(config["model_end_pause"]) 105 | 106 | logger.info(f"[{ID}] Kill signal is being sent to the metrics collector, it should finish running soon...", True) 107 | collecting_process.send_signal(signal.SIGTERM) 108 | collecting_process.wait() 109 | 110 | exported_files_paths = util.get_id_files(ID, current_script_path) 111 | if len(exported_files_paths) != 2: 112 | logger.critical(f"[{ID}] The metrics-collector and model have completed their runs BUT there are only {len(exported_files_paths)} exported data files NOT 2, look into this, exiting...", True) 113 | sys.exit(1) 114 | 115 | # get full file paths for metrics data file & model data file 116 | metrics_data = None 117 | model_data = None 118 | for file in exported_files_paths: 119 | if "_metrics.json" in file: 120 | metrics_data = file 121 | elif "_model.json" in file: 122 | model_data = file 123 | else: 124 | logger.critical(f"[{ID}] Of the expected data output files, this file has an unexpected file 'extension': {file}", True) 125 | sys.exit(1) 126 | 127 | # create reports/ directory if it does not exist 128 | reports_path = os.path.join(current_script_path, "reports") 129 | if not os.path.exists(reports_path): 130 | os.makedirs(reports_path) 131 | 132 | # build filepath for final report file 133 | final_data_path = f'report_{datetime.now(timezone.utc).strftime("%Y-%m-%d_%H-%M-%S.%f_utc")}_{ID}.json' 134 | if name != None: 135 | final_data_path = f"{name}_{final_data_path}" 136 | final_data_path = os.path.join(reports_path, final_data_path) 137 | 138 | final_dataset = { 139 | "model": util.read_json(model_data), 140 | "test_env": { 141 | "params": config, 142 | "commit": util.get_current_commit(), 143 | "hardware": hw.get_all(static_only=True) 144 | }, 145 | "metric": util.read_json(metrics_data) 146 | } 147 | 148 | # export file data/results 149 | util.write_json(final_data_path, final_dataset) 150 | 151 | # delete exported data files from metrics-collector and model-runner 152 | # NOTE: we have to be careful here 153 | util.delete_file(model_data) 154 | util.delete_file(metrics_data) 155 | 156 | logger.warning(f"[{ID}] Deleted exported sub-data files: {model_data} & {metrics_data}", True) 157 | logger.info(f"[{ID}] ==> Created final report from this performance benchmark to file: {final_data_path}", True) 158 | 159 | # TODO: returning the final output data's filepath for now 160 | return final_data_path 161 | 162 | if __name__ == "__main__": 163 | args = parser.parse_args() 164 | 165 | loops = int(args.loops) 166 | if loops < 1: 167 | raise Exception(f"loops MUST be greater than or equal to 1!") 168 | 169 | # single benchmark run 170 | if loops <= 1: 171 | start_time = time.time() 172 | main(name=args.name, config_path=args.config) 173 | runtime = time.time() - start_time 174 | logger.info(f"(single) Total Runtime: {runtime} seconds", True) 175 | sys.exit(0) 176 | 177 | # multiple benchmark runs 178 | start_time = time.time() 179 | all_filepaths = [] 180 | for i in range(int(args.loops)): 181 | i_name = f"run_{i}" 182 | if args.name != None: 183 | i_name = f"{args.name}_{i_name}" 184 | logger.info(f"Run {i+1}/{args.loops} for performance benchmark", True) 185 | filepath = main(name=i_name, config_path=args.config) 186 | 
all_filepaths.append(filepath) 187 | logger.info(f"==> Muli-Run completed for performance benchmark. A total of {args.loops} runs we done and the following data was exported: {all_filepaths}", True) 188 | runtime = time.time() - start_time 189 | logger.info(f"(multiple) Total Runtime: {runtime} seconds", True) 190 | 191 | sys.exit(0) 192 | -------------------------------------------------------------------------------- /src/hf.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | from huggingface_hub import snapshot_download 3 | from huggingface_hub import HfApi 4 | import torch 5 | import time 6 | import os 7 | 8 | import logger 9 | 10 | def count_tokens(model_name, text): 11 | tokenizer = AutoTokenizer.from_pretrained(model_name) 12 | encoded_input = tokenizer(text) 13 | num_tokens = len(encoded_input['input_ids']) 14 | return num_tokens 15 | 16 | def validate_options(options, valid_keys): 17 | user_keys = set(options.keys()) 18 | return user_keys.issubset(valid_keys) 19 | 20 | def str_to_torch_dtype(dtype_str): 21 | dtype_map = { 22 | 'float32': torch.float32, 23 | 'float': torch.float, 24 | 'float64': torch.float64, 25 | 'double': torch.double, 26 | 'float16': torch.float16, 27 | 'half': torch.half, 28 | 'bfloat16': torch.bfloat16, 29 | 'int8': torch.int8, 30 | 'uint8': torch.uint8, 31 | 'int16': torch.int16, 32 | 'short': torch.short, 33 | 'int32': torch.int32, 34 | 'int': torch.int, 35 | 'int64': torch.int64, 36 | 'long': torch.long, 37 | 'bool': torch.bool, 38 | 'complex64': torch.complex64, 39 | 'complex128': torch.complex128, 40 | 'cdouble': torch.cdouble, 41 | 'quint8': torch.quint8, 42 | 'qint8': torch.qint8, 43 | 'qint32': torch.qint32, 44 | 'quint4x2': torch.quint4x2 45 | } 46 | return dtype_map.get(dtype_str, None) 47 | 48 | def run_llm(model_name, input, device="", dtype="bfloat16", model_params={}): 49 | valid_model_params = {"max_length", "temperature", "top_k", "top_p", "num_return_sequences"} 50 | if model_params != {} and validate_options(model_params, valid_model_params) == False: 51 | raise Exception(f"model_params only accepts the following keys: {model_params.keys()}") 52 | 53 | dtype_torch = str_to_torch_dtype(dtype) 54 | if dtype_torch == None: 55 | raise Exception(f"{dtype} is NOT a valid dtype supported by Pytorch") 56 | 57 | # TODO: currently this function only supports one GPU, the goal will be to update this to support muliple GPU(s) 58 | if "cuda:" not in device and device != "cpu" and device != "": 59 | raise Exception(f"device can only be type cuda:N, cpu, or auto") 60 | 61 | """ 62 | November 21, 2023 63 | Default model_params for generate() function 64 | https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig 65 | """ 66 | default_model_params = { 67 | "max_length": 20, 68 | "temperature": 1.0, 69 | "top_k": 50, 70 | "top_p": 1.0, 71 | "num_return_sequences": 1 72 | } 73 | 74 | if device == "": 75 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 76 | 77 | tokenizer = AutoTokenizer.from_pretrained(model_name) 78 | model = AutoModelForCausalLM.from_pretrained(model_name) 79 | model = model.to(device).to(dtype_torch) 80 | 81 | model_dtype = next(model.parameters()).dtype 82 | 83 | inputs = tokenizer(str(input), return_tensors="pt").to(device) 84 | 85 | for key in model_params: 86 | if model_params[key] != None: 87 | default_model_params[key] = model_params[key] 88 | 89 | start_time = time.time() 90 | 91 | 
generated_sequences = model.generate( 92 | inputs["input_ids"], 93 | max_length=default_model_params["max_length"], 94 | temperature=default_model_params["temperature"], 95 | top_k=default_model_params["top_k"], 96 | top_p=default_model_params["top_p"], 97 | num_return_sequences=default_model_params["num_return_sequences"] 98 | ) 99 | 100 | runtime = time.time() - start_time 101 | 102 | response = tokenizer.decode(generated_sequences[0]) 103 | device = generated_sequences.device 104 | 105 | tokens_in = inputs["input_ids"].size(1) * default_model_params["num_return_sequences"] 106 | tokens_out = generated_sequences.size(1) * default_model_params["num_return_sequences"] 107 | 108 | return { 109 | "model_name": model_name, 110 | "runtime_secs": runtime, 111 | "prompt": input, 112 | "response": response, 113 | "tokens": { 114 | "input": tokens_in, 115 | "output": tokens_out 116 | }, 117 | "tokens_out/sec": tokens_out / runtime, 118 | "device": str(generated_sequences.device), 119 | "model_params": { 120 | "dtype": str(model_dtype), 121 | "max_length": default_model_params["max_length"], 122 | "temperature": default_model_params["temperature"], 123 | "top_k": default_model_params["top_k"], 124 | "top_p": default_model_params["top_p"], 125 | "num_return_sequences": default_model_params["num_return_sequences"] 126 | } 127 | } 128 | 129 | # Check if hf model exists and download hf model to local disk if needed 130 | def get_hf_model(hf_repo_path): 131 | # quick & dirty way to check if a hf model/repo exists 132 | api = HfApi() 133 | try: 134 | api.list_repo_files(hf_repo_path) 135 | except Exception as err: 136 | logger.error(f"failed to check if model {hf_repo_path} exists on HuggingFace due to error: {err}") 137 | return False 138 | 139 | # quick & dirty way to check if hf model/repo has been downloaded, saved to cache directory 140 | cache_dir = os.path.join(os.path.expanduser('~'), ".cache/huggingface/hub/") 141 | model_cache_dir = os.path.join(cache_dir, "models--{}".format(hf_repo_path.replace("/", "--"))) 142 | if os.path.isdir(model_cache_dir): 143 | logger.info(f"model {hf_repo_path} already exists in directory {model_cache_dir}") 144 | return True 145 | 146 | # download hf model/repo if it's not downloaded 147 | try: 148 | snapshot_download(repo_id=hf_repo_path, repo_type="model", token=True) 149 | logger.info(f"downloaded model {hf_repo_path}") 150 | return True 151 | except Exception as err: 152 | logger.error(f"{err}") 153 | return False 154 | -------------------------------------------------------------------------------- /src/hw.py: -------------------------------------------------------------------------------- 1 | """ 2 | Function for getting hardware information/metrics in python 3 | 4 | Massive credit goes to Abdeladim Fadheli's amazing article "How to Get Hardware and System Information in Python" 5 | The following functions where based off of Fadheli's article: 6 | get_size() 7 | system_info() 8 | cpu_info() 9 | memory_info() 10 | disk_info() 11 | gpu_info() 12 | 13 | https://thepythoncode.com/article/get-hardware-system-information-python 14 | 15 | November 20, 2023 16 | """ 17 | 18 | import subprocess 19 | import psutil 20 | import platform 21 | import GPUtil 22 | import json 23 | 24 | def execute(cmd): 25 | proc = subprocess.Popen(str(cmd), shell=True, stdout=subprocess.PIPE,) 26 | output = proc.communicate()[0].decode("utf-8") 27 | return output.split("\n") 28 | 29 | def neofetch(): 30 | # requires neofetch (https://github.com/dylanaraps/neofetch) 31 | cmd = "neofetch 
--stdout" 32 | output = {} 33 | try: 34 | stdout = execute(cmd) 35 | for line in stdout: 36 | if ": " not in line: 37 | continue 38 | tmp = line.split(": ", 1) 39 | if len(tmp) != 2: 40 | continue 41 | key = tmp[0].lower().strip().replace(" ", "") 42 | value = tmp[1].strip() 43 | output[key] = value 44 | except: 45 | pass 46 | return output 47 | 48 | def get_size(bytes, suffix="B"): 49 | """ 50 | Scale bytes to its proper format 51 | e.g: 52 | 1253656 => '1.20MB' 53 | 1253656678 => '1.17GB' 54 | """ 55 | factor = 1024 56 | for unit in ["", "K", "M", "G", "T", "P"]: 57 | if bytes < factor: 58 | return f"{bytes:.2f}{unit}{suffix}" 59 | bytes /= factor 60 | 61 | def system_info(native=True): 62 | if native == False: 63 | return neofetch() 64 | 65 | uname = platform.uname() 66 | return { 67 | "system": uname.system, 68 | "node_name": uname.node, 69 | "release": uname.release, 70 | "version": uname.version, 71 | "machine": uname.machine, 72 | "processor": uname.processor 73 | } 74 | 75 | def cpu_info(options={}): 76 | # model_name = options.get("model") 77 | # if model_name == None: 78 | # model_name = neofetch()["cpu"] 79 | 80 | cores = None 81 | if options.get("cores") == True: 82 | cores = {} 83 | for i, percentage in enumerate(psutil.cpu_percent(percpu=True, interval=1)): 84 | cores[str(i)] = f"{percentage}%" 85 | 86 | cpufreq = psutil.cpu_freq() 87 | 88 | output = { 89 | # "model": str(model_name), 90 | "physical_cores": psutil.cpu_count(logical=False), 91 | "total_cores": psutil.cpu_count(logical=True), 92 | "max_frequency": f"{cpufreq.max:.2f}Mhz", 93 | "min_frequency": f"{cpufreq.min:.2f}Mhz", 94 | "current_frequency": f"{cpufreq.current:.2f}Mhz" 95 | } 96 | 97 | if cores != None: 98 | output["cores"] = cores 99 | 100 | return output 101 | 102 | def memory_info(): 103 | svmem = psutil.virtual_memory() 104 | swap = psutil.swap_memory() 105 | return { 106 | "total": get_size(svmem.total), 107 | "available": get_size(svmem.available), 108 | "used": get_size(svmem.used), 109 | "percentage": svmem.percent, 110 | "swap": { 111 | "total": get_size(swap.total), 112 | "free": get_size(swap.free), 113 | "used": get_size(swap.used), 114 | "usage": swap.percent 115 | } 116 | } 117 | 118 | def disk_info(): 119 | output = {} 120 | partitions = psutil.disk_partitions() 121 | for partition in partitions: 122 | try: 123 | partition_usage = psutil.disk_usage(partition.mountpoint) 124 | except PermissionError: 125 | partition_usage = "" 126 | 127 | output["partition.device"] = { 128 | "mount_point": partition.mountpoint, 129 | "file_system_type": partition.fstype 130 | } 131 | 132 | if partition_usage != "": 133 | output["partition.device"] = {**output["partition.device"], **{ 134 | "total_size": get_size(partition_usage.total), 135 | "used": get_size(partition_usage.used), 136 | "free": get_size(partition_usage.free), 137 | "usage": f"{partition_usage.percent}%" 138 | }} 139 | 140 | disk_io = psutil.disk_io_counters() 141 | return { 142 | "total_read": get_size(disk_io.read_bytes), 143 | "total_write": get_size(disk_io.write_bytes), 144 | "partitions": output 145 | } 146 | 147 | def gpu_info(raw=False): 148 | gpus = GPUtil.getGPUs() 149 | gpus_data = {} 150 | for gpu in gpus: 151 | if raw: 152 | gpus_data[gpu.id] = vars(gpu) 153 | else: 154 | gpus_data[gpu.id] = { 155 | "id": gpu.id, 156 | "uuid": gpu.uuid, 157 | "name": gpu.name, 158 | "driver": gpu.driver, 159 | "load": f"{gpu.load*100}%", 160 | "memory": { 161 | "free": f"{gpu.memoryFree}MB", 162 | "used": f"{gpu.memoryUsed}MB", 163 | "total": 
f"{gpu.memoryTotal}MB" 164 | }, 165 | "temp": f"{gpu.temperature} °C" 166 | } 167 | return gpus_data 168 | 169 | def get_all(static_only=False): 170 | if static_only: 171 | return { 172 | "system": system_info(), 173 | "neofetch": system_info(False) 174 | } 175 | 176 | return { 177 | "cpu": cpu_info({"cores": True}), 178 | "ram": memory_info(), 179 | "disk": disk_info(), 180 | "gpu": gpu_info() 181 | } 182 | -------------------------------------------------------------------------------- /src/llmvm.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from llm_vm.client import Client 3 | import torch 4 | import time 5 | import sys 6 | import os 7 | import subprocess 8 | 9 | import logger 10 | 11 | ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)) 12 | sys.path.extend([ROOT_DIR, os.path.join(ROOT_DIR, "src")]) 13 | import util 14 | 15 | """ 16 | ABOUT: 17 | This function contains the official logic used by LLM-VM to pick a device (GPUs or CPUs) 18 | NOTES: 19 | Update this function as need be, LLM-VM is constantly changing 20 | LAST-DATE: 21 | November 15, 2023 22 | SOURCE: 23 | https://github.com/anarchy-ai/LLM-VM/blob/main/src/llm_vm/onsite_llm.py 24 | ~lines 45 - 49 25 | """ 26 | def llmvm_device_picker(): 27 | device = None 28 | if torch.cuda.device_count() > 1: 29 | device = [f"cuda:{i}" for i in range(torch.cuda.device_count())] # List of available GPUs 30 | else: # If only one GPU is available, use cuda:0, else use CPU 31 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 32 | return device 33 | 34 | def count_tokens(model_name, text): 35 | tokenizer = AutoTokenizer.from_pretrained(model_name) 36 | encoded_input = tokenizer(text) 37 | num_tokens = len(encoded_input['input_ids']) 38 | return num_tokens 39 | 40 | def run_llm(model_name, prompt, supported_models, model_params={}): 41 | if model_name not in supported_models: 42 | raise Exception("model {} is NOT supported in LLM-VM".format(model_name)) 43 | if type(supported_models[model_name]) != str: 44 | raise Exception("model {} is a close-sourced, API based, model".format(model_name)) 45 | if type(prompt) != str or len(prompt) == 0: 46 | raise Exception("prompt MOST be type str and have a length greater then 0") 47 | 48 | """ 49 | ABOUT: 50 | This is the default value for temperature in LLM-VM at the moment 51 | LAST-DATE: 52 | November 24, 2023 53 | SOURCE: 54 | https://github.com/anarchy-ai/LLM-VM/blob/main/src/llm_vm/client.py 55 | ~lines 109 56 | """ 57 | temp = model_params.get("temperature") 58 | if temp == None: 59 | temp = 0 60 | # TODO: remove this log when a solution is implmented 61 | logger.warning(f"currently, {run_llm.__name__}() only supports LLM-VM(s): temperature", True) 62 | 63 | client = Client(big_model=str(model_name)) 64 | 65 | start_time = time.time() 66 | 67 | # NOTE: we only accept temperatuer for now 68 | response=client.complete(prompt=prompt, temperature=temp) 69 | 70 | runtime = time.time() - start_time 71 | 72 | device = llmvm_device_picker() 73 | huggingface_path = supported_models[model_name] 74 | tokens_in = count_tokens(huggingface_path, prompt) 75 | tokens_out = count_tokens(huggingface_path, response["completion"]) 76 | 77 | return { 78 | "model_name": model_name, 79 | "model_path": huggingface_path, 80 | "runtime_secs": runtime, 81 | "prompt": prompt, 82 | "response": response, 83 | "tokens": { 84 | "input": tokens_in, 85 | "output": tokens_out 86 | }, 87 | 
"tokens_out/sec": tokens_out / runtime, 88 | "device": str(device), 89 | "model_params": { 90 | "temperature": temp 91 | } 92 | } 93 | 94 | -------------------------------------------------------------------------------- /src/logger.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | import logging 3 | import time 4 | import os 5 | 6 | colors = { 7 | 'black': '\033[30m', 8 | 'red': '\033[31m', 9 | 'green': '\033[32m', 10 | 'yellow': '\033[33m', 11 | 'blue': '\033[34m', 12 | 'magenta': '\033[35m', 13 | 'cyan': '\033[36m', 14 | 'white': '\033[37m', 15 | 'bright_black': '\033[90m', 16 | 'bright_red': '\033[91m', 17 | 'bright_green': '\033[92m', 18 | 'bright_yellow': '\033[93m', 19 | 'bright_blue': '\033[94m', 20 | 'bright_magenta': '\033[95m', 21 | 'bright_cyan': '\033[96m', 22 | 'bright_white': '\033[97m', 23 | 'reset': '\033[0m', 24 | } 25 | 26 | # determine and set the path for the log file 27 | filepath = os.path.dirname(os.path.abspath(__file__)) 28 | filepath = "/".join(filepath.split("/")[:-1]) 29 | filepath = os.path.join(filepath, "events.log") 30 | 31 | logging.basicConfig(level=logging.DEBUG, 32 | format='%(asctime)s %(filename)s:%(funcName)s:%(lineno)d [%(levelname)s] %(message)s', 33 | datefmt='%m/%d/%Y %H:%M:%S %z', 34 | filename=filepath, 35 | filemode='a') 36 | 37 | logging.Formatter.converter = time.gmtime 38 | 39 | def logger_printer(msg, sev=None): 40 | sev = str(sev).lower() 41 | 42 | timestamp = datetime.now(timezone.utc).strftime("%m-%d-%Y %H:%M:%S.%f UTC") 43 | timestamp = f"{timestamp} ({datetime.utcnow().timestamp()})" 44 | 45 | sev_val = "" 46 | color_val = "" 47 | end_val = colors.get("reset", "") 48 | if sev == "info" or sev == "1": 49 | color_val = colors.get("green", "") 50 | sev_val = "INFO" 51 | elif sev == "warning" or sev == "2": 52 | color_val = colors.get("yellow", "") 53 | sev_val = "WARNING" 54 | elif sev == "error" or sev == "3": 55 | color_val = colors.get("magenta", "") 56 | sev_val = "ERROR" 57 | elif sev == "critical" or sev == "4": 58 | color_val = colors.get("red", "") 59 | sev_val = "CRITICAL" 60 | else: 61 | color_val = None 62 | sev_val = "NOTSET" 63 | 64 | if color_val == None: 65 | color_val = "" 66 | end_val = "" 67 | 68 | print(f"{color_val}{timestamp} [{sev_val}] - {msg}{end_val}" ) 69 | 70 | def info(msg, print_it=False): 71 | logging.info(msg) 72 | if print_it: 73 | logger_printer(msg, 1) 74 | 75 | def warning(msg, print_it=False): 76 | logging.warning(msg) 77 | if print_it: 78 | logger_printer(msg, 2) 79 | 80 | def error(msg, print_it=False): 81 | logging.error(msg) 82 | if print_it: 83 | logger_printer(msg, 3) 84 | 85 | def critical(msg, print_it=False): 86 | logging.critical(msg) 87 | if print_it: 88 | logger_printer(msg, 4) 89 | 90 | -------------------------------------------------------------------------------- /src/pointer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import util 4 | 5 | ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)) 6 | 7 | SUPPORTED_FRAMEWORKS = [ 8 | "huggingface", 9 | "llm-vm" 10 | ] 11 | 12 | # NOTE: are of 11-24-2023 these are the current models supported in LLM-VM 13 | llmvm_supported_models = { 14 | "pythia": "EleutherAI/pythia-70m-deduped", 15 | "opt": "facebook/opt-350m", 16 | "bloom": "bigscience/bloom-560m", 17 | "neo": "EleutherAI/gpt-neo-1.3B", 18 | "smallorca": "Open-Orca/LlongOrca-7B-16k", 19 | 
"orca": "Open-Orca/LlongOrca-13B-16k", 20 | "mistral": "Open-Orca/Mistral-7B-OpenOrca", 21 | "platypus": "Open-Orca/OpenOrca-Platypus2-13B", 22 | "llama": "openlm-research/open_llama_3b_v2", 23 | "llama2": "meta-llama/Llama-2-7b-hf", 24 | "codellama-7b": "codellama/CodeLlama-7b-hf", 25 | "codellama-13b": "codellama/CodeLlama-13b-hf", 26 | "codellama-34b": "codellama/CodeLlama-34b-hf", 27 | "flan": "google/flan-t5-small", 28 | "bert": None, 29 | "gpt": None, 30 | "gpt4": None, 31 | "chat_gpt": None, 32 | "quantized-llama2-7b-base": "TheBloke/Llama-2-7B-GGML", 33 | "quantized-llama2-13b-base": "TheBloke/Llama-2-13B-GGML", 34 | "llama2-7b-chat-Q4": "TheBloke/Llama-2-7B-Chat-GGML", 35 | "llama2-7b-chat-Q6": "TheBloke/Llama-2-7B-Chat-GGML", 36 | "llama2-13b-chat-Q4": "TheBloke/Llama-2-13B-Chat-GGML", 37 | "llama2-13b-chat-Q6": "TheBloke/Llama-2-13B-Chat-GGML", 38 | "llama2-7b-32k-Q4": "TheBloke/Llama-2-7B-32K-Instruct-GGML" 39 | } 40 | 41 | def execute_llm(framework, model_name=None, prompt=None, device=None, dtype=None, model_params={}): 42 | if framework not in SUPPORTED_FRAMEWORKS: 43 | raise Exception(f"framework {framework} is not supported!") 44 | 45 | if framework == "huggingface": 46 | import hf 47 | return hf.run_llm(model_name=model_name, input=prompt, device=device, dtype=dtype, model_params=model_params) 48 | 49 | if framework == "llm-vm": 50 | import llmvm 51 | return llmvm.run_llm(model_name=model_name, prompt=prompt, model_params={}, supported_models=llmvm_supported_models) 52 | 53 | raise Exception(f"the logic is off in {execute_llm.__name__}()") 54 | -------------------------------------------------------------------------------- /src/util.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import json 3 | import os 4 | 5 | import logger 6 | 7 | def execute(cmd): 8 | proc = subprocess.Popen(str(cmd), shell=True, stdout=subprocess.PIPE,) 9 | output = proc.communicate()[0].decode("utf-8") 10 | return output.split("\n") 11 | 12 | def read_file(path): 13 | with open(str(path)) as file: 14 | content = file.readlines() 15 | content = [i.strip() for i in content] 16 | return content 17 | 18 | def write_file(path, data): 19 | file = open(str(path), "w") 20 | for line in data: 21 | file.write(str(line) + "\n") 22 | file.close() 23 | 24 | def read_json(path): 25 | with open(str(path)) as file: 26 | content = json.load(file) 27 | return content 28 | 29 | def write_json(path, data): 30 | with open(str(path), "w") as file: 31 | # NOTE: ensure_ascii will write unicode characters as they are 32 | json.dump(data, file, indent=4, ensure_ascii=False) 33 | 34 | def create_file(path): 35 | file = open(str(path), "a+") 36 | file.close() 37 | 38 | def get_id_files(id, dir_path): 39 | if os.path.exists(dir_path) == False or os.path.isdir(dir_path) == False: 40 | raise Exception(f"dir path {dir_path} does not exist!") 41 | files = [] 42 | for f in os.listdir(dir_path): 43 | full_path = os.path.join(dir_path, f) 44 | if os.path.isfile(full_path) and (str(id) in f) and (".json" in full_path): 45 | files.append(full_path) 46 | return files 47 | 48 | def delete_file(file_path): 49 | if os.path.isfile(file_path): 50 | try: 51 | os.remove(file_path) 52 | logger.info(f"deleted file {file_path}") 53 | except Exception as err: 54 | logger.error(f"error! 
failed to delete file {file_path}") 55 | else: 56 | logger.info(f"file {file_path} does not exist") 57 | 58 | def get_current_commit(): 59 | try: 60 | cmd = "git log | head -n 1 | awk '{print $2}'" 61 | output = execute(cmd) 62 | return output[0] 63 | except: 64 | return "" 65 | -------------------------------------------------------------------------------- /tools/columns.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fnmatch 4 | import csv 5 | import time 6 | 7 | def read_json(path): 8 | with open(str(path)) as file: 9 | content = json.load(file) 10 | return content 11 | 12 | def find_json_files(directory, deep_search=False): 13 | if deep_search == False: 14 | matches = [] 15 | for file in os.listdir(directory): 16 | if file.endswith('.json'): 17 | matches.append(os.path.join(directory, file)) 18 | return matches 19 | 20 | matches = [] 21 | for root, dirnames, filenames in os.walk(directory): 22 | for filename in fnmatch.filter(filenames, '*.json'): 23 | matches.append(os.path.join(root, filename)) 24 | return matches 25 | 26 | # https://stackoverflow.com/a/34311071 27 | def save_csv(filepath, data): 28 | filepath = str(filepath) 29 | if filepath.endswith(".csv") == False: 30 | raise Exception(f"filepath MOST end with .csv") 31 | with open(str(filepath), "w") as f: 32 | wr = csv.DictWriter(f, delimiter="\t",fieldnames=list(data[0].keys())) 33 | wr.writeheader() 34 | wr.writerows(data) 35 | 36 | # main function calls 37 | 38 | reports_path = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), "reports") 39 | 40 | json_files_paths = find_json_files(reports_path) 41 | 42 | output = [] 43 | for path in json_files_paths: 44 | print(f"Processing file: {path}") 45 | 46 | data = read_json(path) 47 | 48 | max_gpu_memory_usage = -1 49 | min_gpu_memory_usage = 1000000000000000000000000 50 | for key in list(data["metric"].keys()): 51 | # TODO: this currently only assumes the RAM usage is in MB, this needs to be fixed 52 | memory_usage = float(data["metric"][key]['gpu']['0']['memory']['used'].replace('MB', '')) / 1024 53 | if memory_usage > max_gpu_memory_usage: 54 | max_gpu_memory_usage = memory_usage 55 | if memory_usage < min_gpu_memory_usage: 56 | min_gpu_memory_usage = memory_usage 57 | 58 | tmp = { 59 | **data["test_env"]["params"], 60 | "runtime_sec": data["model"]["runtime_secs"], 61 | "tokens/sec": data["model"]["tokens_out/sec"], 62 | "tokens_in": data["model"]["tokens"]["input"], 63 | "tokens_out": data["model"]["tokens"]["output"], 64 | "gpu": data["metric"][list(data["metric"].keys())[0]]["gpu"]["0"]["name"], 65 | "max_gpu_memory_usage": max_gpu_memory_usage, 66 | "min_gpu_memory_usage": min_gpu_memory_usage, 67 | "file": path.split("/")[-1] 68 | } 69 | 70 | del tmp["prompt"] 71 | del tmp["model_start_pause"] 72 | del tmp["model_end_pause"] 73 | 74 | output.append(tmp) 75 | 76 | csv_filename = f"results_{time.time()}.csv" 77 | save_csv(csv_filename, output) 78 | print(f"Created CSV file: {csv_filename}") 79 | 80 | -------------------------------------------------------------------------------- /tools/graph.py: -------------------------------------------------------------------------------- 1 | # NOTE: (11-24-2023) most of this code was generated by ChatGPT (GPT-4). 
It is a quick solution and needs refinement 2 | 3 | import matplotlib.pyplot as plt 4 | import matplotlib.dates as mdates 5 | from datetime import datetime 6 | import json 7 | import os 8 | import glob 9 | 10 | def pick_json_file(parent_directory): 11 | # list all files in the parent directory that have a .json extension. 12 | json_files = glob.glob(os.path.join(parent_directory, 'reports/*.json')) 13 | json_files = [string for string in json_files if "report" in string] 14 | 15 | # display the JSON files. 16 | print("Please select a JSON file to load:") 17 | for idx, file_name in enumerate(json_files): 18 | print(f"{idx}: {file_name}") 19 | 20 | # prompt the user to select a file by number. 21 | selected_index = int(input("Enter the number of the JSON file you want to read: ")) 22 | selected_file_path = json_files[selected_index] 23 | 24 | # read the selected JSON file. 25 | with open(selected_file_path) as file: 26 | content = json.load(file) 27 | 28 | return content 29 | 30 | # load data 31 | parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) 32 | data = pick_json_file(parent_directory) 33 | 34 | # TODO: (11-24-2023) for now, we will focus on GPU metrics 35 | data = data["metric"] 36 | 37 | timestamps = [] 38 | memory_usages = [] 39 | for epoch, info in data.items(): 40 | # convert epoch to datetime for plotting 41 | timestamp = datetime.fromtimestamp(float(epoch)) 42 | # get GPU memory usage and convert it to GB 43 | memory_usage = float(info['gpu']['0']['memory']['used'].replace('MB', '')) / 1024 44 | 45 | timestamps.append(timestamp) 46 | memory_usages.append(memory_usage) 47 | 48 | # plotting 49 | plt.figure(figsize=(10, 5)) 50 | plt.plot(timestamps, memory_usages, marker='o') 51 | plt.xlabel('Epoch Time (seconds)') 52 | plt.ylabel('GPU Memory Usage (GB)') 53 | plt.title('GPU Memory Usage Over Time') 54 | plt.grid(True) 55 | plt.tight_layout() 56 | 57 | plt.show() 58 | --------------------------------------------------------------------------------