├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── assets ├── docs │ ├── logo.png │ └── sample_results │ │ ├── Llama-2-7b-hf_L40.json │ │ ├── codellama-13b-oasst-sft-v10_H100.json │ │ └── mpt-7b_L40.json └── setup │ └── runpod.sh ├── configs ├── hf_test.json └── llmvm_test.json ├── metrics.py ├── model.py ├── run.py ├── src ├── hf.py ├── hw.py ├── llmvm.py ├── logger.py ├── pointer.py └── util.py └── tools ├── columns.py └── graph.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # project specific 163 | .DS_Store 164 | LLM-VM/ 165 | finetuned_models/ 166 | assets/sandbox/ 167 | *_metrics.json 168 | *_model.json 169 | report_*.json 170 | results/ 171 | events.log 172 | *run_*.json 173 | reports/ 174 | tools/*.csv 175 | 176 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.analysis.extraPaths": [ 3 | "./src/LLM-VM/src" 4 | ] 5 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Mehmet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM Speed Benchmark (LLMSB) 2 | 3 |

4 | 5 |

6 | 7 | 🚧 LLM Speed Benchmark (LLMSB) is currently in beta (v0). Please do not use this in production, or use it at your own risk. We're still ironing out some kinks and improving functionality. If you encounter any bugs or have suggestions, kindly report them under [ISSUES](https://github.com/anarchy-ai/benchllm/issues). Your feedback is invaluable! 8 | 9 | ## About 10 | 11 | LLM Speed Benchmark (LLMSB) is a benchmarking tool for assessing LLM models' performance across different hardware platforms. Its ultimate goal is to compile a comprehensive dataset detailing LLM models' performance on various systems, enabling users to more effectively choose the right LLM model(s) for their projects. 12 | 13 | ## Limitations 14 | 15 | LLMSB is at v0, so it has limitations: 16 | - It is only designed to run on Debian-based operating systems, i.e. it is not designed to run on Windows. This is because LLMSB uses neofetch and nvidia-smi to gather metrics under the hood, and the filepath logic is based on Unix operating systems. 17 | - Due to how metrics are recorded, the metrics collector can take up to 1 second to perform a single collection. This means that, at the fastest, we can collect hardware metrics every 1 second. 18 | - LLMSB only uses HuggingFace to load and run models. This works for now, but the goal is to have LLMSB support multiple frameworks, not just HuggingFace. 19 | - Currently, all models are run through the logic in the run_llm() function, located in src/hf.py, where the AutoTokenizer() and AutoModelForCausalLM() functions are used to load and run a model. This works, but it limits how we can configure/optimize specific models. Knowing this, the goal is to create separate classes for each popular model and utilize HuggingFace's model-specific classes, like LlamaTokenizer & LlamaForCausalLM, instead. 20 | - LLMSB only gathers general, high-level metrics. In the future, we would like to gather lower-level metrics. We think this can partly be done using PyTorch's [profiler wrapper](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html). 21 | 22 | ## Sample Outputs 23 | 24 | ### November 22, 2023 25 | 26 | LLMSB was run/tested on an L40 and an H100 GPU through [RunPod](https://www.runpod.io/). In those benchmarks the models [llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf), [codellama-13b-oasst-sft-v10](https://huggingface.co/OpenAssistant/codellama-13b-oasst-sft-v10), & [mpt-7b](https://huggingface.co/mosaicml/mpt-7b) were tested. 27 | 28 | Check out the results [HERE](https://github.com/anarchy-ai/llm-speed-benchmark/tree/main/assets/docs/sample_results). If any errors/issues are noticed, please report them under ISSUES. 29 | 30 | ## Setup 31 | 32 | 1. Create and activate a Python environment: 33 | ``` 34 | python3 -m venv env 35 | source env/bin/activate 36 | ``` 37 | 38 | 2. Install package dependencies (using APT): 39 | ``` 40 | apt -y update 41 | apt install -y vim 42 | apt install -y neofetch 43 | ``` 44 | 45 | 3. Install Python dependencies: 46 | ``` 47 | pip3 install transformers 48 | pip3 install psutil 49 | pip3 install gputil 50 | pip3 install tabulate 51 | pip3 install sentencepiece 52 | pip3 install protobuf 53 | ``` 54 | 55 | 4. Install PyTorch (to determine how to install PyTorch for your system, check out their tool at https://pytorch.org/): 56 | ``` 57 | # install the pytorch stable build, for linux, using CUDA 12.1: 58 | pip3 install torch torchvision torchaudio 59 | ``` 60 | 61 | 5. 
Install [LLM-VM](https://github.com/anarchy-ai/LLM-VM/tree/main): 62 | ``` 63 | pip install llm-vm 64 | ``` 65 | 66 | 6. (optional) If you are using models like LLaMA, you will need a HuggingFace access token. Set up your access token [HERE](https://huggingface.co/settings/tokens), then save the token on your machine by running the following command: 67 | ``` 68 | huggingface-cli login 69 | ``` 70 | 71 | ## How To Run 72 | 73 | 1. Complete the steps listed in the __Setup__ section. 74 | 75 | 2. To configure your test, you need to create a JSON file with the following parameters (here is an example): 76 | - NOTE: not every framework supports the same parameters 77 | ``` 78 | { 79 | "model": "bigscience/bloom-560m", # the model's path/repo on HuggingFace (https://huggingface.co/models) 80 | "prompt": "Hello World!", # the prompt you want to input into the LLM model 81 | "device": "cuda:0", # the device you want to run the LLM model on (GPU/CPU) 82 | "max_length": 50, # the maximum length of the generated tokens 83 | "temperature": 0.9, # temperature value for the LLM model 84 | "top_k": 50, # top-k value for the LLM model 85 | "top_p": 0.9, # top-p value for the LLM model 86 | "num_return_sequences": 1, # the number of independently run instances of the model 87 | "time_delay": 0, # the time delay (seconds) the metrics collector will wait per iteration 88 | "model_start_pause": 1, # the time (seconds) the test will wait BEFORE running the LLM model 89 | "model_end_pause": 1, # the time (seconds) the test will wait AFTER the LLM model is done running 90 | "framework": "llm-vm" # the name of the framework/library you want to use to run the model 91 | } 92 | ``` 93 | 94 | 3. Using the path to the config file you created in the previous step, run the following to start the benchmark (pick one option): 95 | ``` 96 | # run one benchmark 97 | python3 run.py --config ./configs/llmvm_test.json 98 | 99 | # run more than one benchmark (in this case 3) 100 | python3 run.py --config ./configs/llmvm_test.json --loops 3 101 | ``` 102 | 103 | 4. After the benchmark is done running, check out the final results in a file that should look something like this: 104 | ``` 105 | report_2023-11-25_05:55:04.207515_utc_1ffc4fa7-3aa9-4878-b874-1ff445e1ff8a.json 106 | ``` 107 | 108 | ## Setting Up RunPod: 109 | 110 | 1. Set up RunPod, set up your SSH cert/key, and get a pod running. You can access your pod(s) here: https://www.runpod.io/console/pods 111 | 112 | 2. Click the "Connect" button to get the SSH connection info. This info should look something like this: 113 | ``` 114 | ssh root@12.345.678.90 -p 12345 -i ~/.ssh/id_example 115 | ``` 116 | - This command will be formatted like this: 117 | ``` 118 | ssh <user>@<ip-address> -p <port> -i <path-to-ssh-key> 119 | ``` 120 | 121 | 3. Using the command in step #2, you should be able to SSH into the pod and use the GPU you selected for that RunPod pod. 122 | 123 | 4. If you want to copy a file from the pod to your local machine, you would run a command in this format (this is referring to the variables shown in step #2): 124 | ``` 125 | scp -P <port> -i <path-to-ssh-key> <user>@<ip-address>:<file-path-on-pod> <local-destination-path> 126 | ``` 127 | - Here is an example of such a command: 128 | ``` 129 | scp -P 12345 -i ~/.ssh/id_example root@12.345.678.90:/root/test.txt /home/user1/Downloads/ 130 | ``` 131 | 132 | 5. After you are done with the pod, shut it down or pause it. But be warned: if you pause it, you will still get charged, just much less. 
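Putting the Setup, How To Run, and RunPod sections together, a full session on a fresh pod looks roughly like the sketch below. This is only an outline, not an official script: the repository URL is assumed from the sample-results link above, and the port, key path, and pod IP are placeholders you must replace with the connection info from your own pod.
```
# on the pod: grab the code and set up the environment
git clone https://github.com/anarchy-ai/llm-speed-benchmark.git
cd llm-speed-benchmark
python3 -m venv env
source env/bin/activate
bash assets/setup/runpod.sh   # installs the APT + pip dependencies, then prompts for your HuggingFace token

# run the benchmark a few times with one of the sample configs
python3 run.py --config ./configs/llmvm_test.json --loops 3

# back on your local machine: copy the generated reports off the pod
scp -P <port> -i <path-to-ssh-key> root@<ip-address>:/root/llm-speed-benchmark/reports/*.json ~/Downloads/
```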
133 | 134 | ## Great Sources: 135 | 136 | - Great datasets of prompts (if you can't come up with any): 137 | - https://github.com/f/awesome-chatgpt-prompts/tree/main 138 | - https://huggingface.co/datasets/bigscience/P3 139 | - https://www.kaggle.com/datasets/ratthachat/writing-prompts 140 | 141 | - Learn more about LLM parameters: https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig 142 | 143 | - Great benchmark to benchmark cloud-based LLM models: https://github.com/ray-project/llmperf 144 | 145 | - Cool LLM intelligence leadboards: 146 | - https://fasteval.github.io/FastEval/ 147 | - https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard 148 | -------------------------------------------------------------------------------- /assets/docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anarchy-ai/llm-speed-benchmark/2b2917c390074fba879bf5090139960d51999561/assets/docs/logo.png -------------------------------------------------------------------------------- /assets/setup/runpod.sh: -------------------------------------------------------------------------------- 1 | # Website: https://www.runpod.io/console/pods 2 | # Service: RunPod 3 | 4 | apt -y update 5 | apt install -y vim 6 | apt install -y neofetch 7 | 8 | pip3 install transformers 9 | pip3 install psutil 10 | pip3 install gputil 11 | pip3 install tabulate 12 | pip3 install torch torchvision torchaudio 13 | pip3 install matplotlib 14 | pip3 install sentencepiece 15 | pip3 install protobuf 16 | 17 | 18 | pip install llm-vm 19 | 20 | huggingface-cli login 21 | 22 | -------------------------------------------------------------------------------- /configs/hf_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "facebook/opt-iml-max-1.3b", 3 | "prompt": "Hello World!", 4 | "device": "cuda:0", 5 | 6 | "max_length": 1000, 7 | "temperature": 0.9, 8 | "top_k": 50, 9 | "top_p": 0.9, 10 | "num_return_sequences": 1, 11 | 12 | "time_delay": 0, 13 | "model_start_pause": 1, 14 | "model_end_pause": 1, 15 | "dtype": "bfloat16", 16 | "framework": "huggingface" 17 | } -------------------------------------------------------------------------------- /configs/llmvm_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "neo", 3 | "prompt": "QUESTION: what is your goal in life ANSWER: ", 4 | 5 | "max_length": null, 6 | "temperature": 0.9, 7 | "top_k": null, 8 | "top_p": null, 9 | "num_return_sequences": null, 10 | 11 | "time_delay": 0, 12 | "model_start_pause": 1, 13 | "model_end_pause": 1, 14 | "dtype": null, 15 | "framework": "llm-vm" 16 | } 17 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import sys 4 | import json 5 | import signal 6 | import argparse 7 | import uuid 8 | 9 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")) 10 | import hw 11 | import logger 12 | 13 | # config arguments 14 | parser = argparse.ArgumentParser(description='run hardware performance/metrics collector') 15 | parser.add_argument('--time-delay', type=int, default=1, help='the time dely, in seconds, for each collection interation') 16 | parser.add_argument('--uuid', type=str, default=str(uuid.uuid4()), help='the UUID for the collection') 17 | 18 
| # global variable to determine when the collection loop should stop 19 | running = True 20 | 21 | # signal handler 22 | def signal_handler(signum, frame): 23 | global running 24 | running = False 25 | 26 | if __name__ == "__main__": 27 | args = parser.parse_args() 28 | 29 | signal.signal(signal.SIGTERM, signal_handler) 30 | signal.signal(signal.SIGINT, signal_handler) 31 | 32 | logger.info(f"{args.uuid} - metrics collection has started...") 33 | 34 | metrics = {} 35 | counter = 0 36 | while running: 37 | timestamp = str(time.time()) 38 | metrics[timestamp] = hw.get_all() 39 | logger.info(f"{args.uuid} - metrics collector - Collected metrics for the {counter+1} time, now waiting for {args.time_delay} sec") 40 | counter += 1 41 | time.sleep(args.time_delay) 42 | 43 | logger.info(f"{args.uuid} - metrics collecton has concluded!") 44 | 45 | filepath = f"{args.uuid}_metrics.json" 46 | with open(str(filepath), "w") as file: 47 | json.dump(metrics, file, indent=4) 48 | 49 | logger.info(f"{args.uuid} - metrics collector - Saved {len(metrics.keys())} data points to file {filepath}") 50 | 51 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import json 4 | import os 5 | import sys 6 | import copy 7 | import gc 8 | import signal 9 | import argparse 10 | import uuid 11 | 12 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")) 13 | import logger 14 | import pointer 15 | 16 | def float_or_none(value): 17 | if value.lower() == 'none': 18 | return None 19 | try: 20 | return float(value) 21 | except ValueError: 22 | raise argparse.ArgumentTypeError(f"{value} must be a floating point number or 'None'") 23 | 24 | def int_or_none(value): 25 | if value.lower() == 'none': 26 | return None 27 | try: 28 | return int(value) 29 | except ValueError: 30 | raise argparse.ArgumentTypeError(f"{value} must be an int number or 'None'") 31 | 32 | def str_or_none(value): 33 | if value.lower() == 'none': 34 | return None 35 | try: 36 | return str(value) 37 | except ValueError: 38 | raise argparse.ArgumentTypeError(f"{value} must be a string or 'None'") 39 | 40 | parser = argparse.ArgumentParser(description='run llm model hosted on HuggingFace') 41 | 42 | """ 43 | November 21, 2023 44 | The default values and help values for most of these parameters were taken directly from huggingface documentation: 45 | https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig 46 | """ 47 | parser.add_argument('--max_length', type=int_or_none, default=20, help='The maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. 
Its effect is overridden by max_new_tokens, if also set.') 48 | parser.add_argument('--temperature', type=float_or_none, default=1.0, help='The value used to modulate the next token probabilities.') 49 | parser.add_argument('--top_k', type=int_or_none, default=50, help='The number of highest probability vocabulary tokens to keep for top-k-filtering.') 50 | parser.add_argument('--top_p', type=float_or_none, default=1.0, help='If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.') 51 | parser.add_argument('--num_return_sequences', type=int_or_none, default=1, help='The number of independently computed returned sequences for each element in the batch.') 52 | 53 | parser.add_argument('--uuid', type=str_or_none, default=str(uuid.uuid4()), help='The UUID for the collection') 54 | parser.add_argument('--prompt', type=str_or_none, default="Hello World", help='Text prompt for the LLM model to respond too') 55 | parser.add_argument('--model', type=str_or_none, default="", help='Huggingface repo/path to LLM model') 56 | parser.add_argument('--device', type=str_or_none, default="", help='Device to run the model on, this can be "cpu" or "cuda:N"') 57 | parser.add_argument('--dtype', type=str_or_none, default="bfloat16", help="A tensor's data type, this will effect the overall accuracy and hardware performance for a model") 58 | parser.add_argument('--framework', type=str_or_none, default=None, help="Perfered framework to run LLM model on (huggingface, LLM-VM, etc)") 59 | 60 | # signal handler 61 | def signal_handler(signum, frame): 62 | sys.exit(1) 63 | 64 | if __name__ == "__main__": 65 | signal.signal(signal.SIGTERM, signal_handler) 66 | signal.signal(signal.SIGINT, signal_handler) 67 | 68 | args = parser.parse_args() 69 | 70 | if args.model == "": 71 | logger.error(f"{args.uuid} - model not provided, please provide a model from huggingface: https://huggingface.co/models") 72 | sys.exit(1) 73 | 74 | logger.info(f"{args.uuid} - running model with following parameters {str(args)}") 75 | 76 | start_time = time.time() 77 | 78 | logger.info(f"{args.uuid} - model {args.model} started at epoch time {start_time} seconds") 79 | 80 | # (11-30-2023) Change this part, for finetuning or custom model running 81 | ############################################################################################################## 82 | 83 | try: 84 | output = pointer.execute_llm(args.framework, args.model, args.prompt, args.device, args.dtype, { 85 | "max_length": args.max_length, 86 | "temperature": args.temperature, 87 | "top_k": args.top_k, 88 | "top_p": args.top_p, 89 | "num_return_sequences": args.num_return_sequences 90 | }) 91 | except Exception as err: 92 | logger.critical(f"{args.uuid} - existing... 
due to model {args.model} failing to run with error: {err}") 93 | sys.exit(1) 94 | 95 | ############################################################################################################## 96 | 97 | end_time = time.time() 98 | 99 | logger.info(f"{args.uuid} - model {args.model} completed at epoch time {end_time} seconds") 100 | 101 | output["run_period"] = { 102 | "started": start_time, 103 | "ended": end_time 104 | } 105 | 106 | # delete cache and variables to free up resources for better metrics collecting 107 | final_result = copy.deepcopy(output) 108 | logger.info(f"{args.uuid} - calling Python's garbage collector and emptying the CUDA cache if a GPU was used") 109 | gc.collect() 110 | del output 111 | if "cuda" in str(args.device): 112 | torch.cuda.empty_cache() 113 | 114 | filepath = f"{args.uuid}_model.json" 115 | with open(str(filepath), "w") as file: 116 | json.dump(final_result, file, indent=4) 117 | 118 | logger.info(f"{args.uuid} - model running - saved output for model run to file {filepath}") 119 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | import subprocess 3 | import argparse 4 | import time 5 | import os 6 | import signal 7 | import uuid 8 | import sys 9 | import json 10 | 11 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "src")) 12 | import logger 13 | import util 14 | import hf 15 | import hw 16 | 17 | # config arguments 18 | parser = argparse.ArgumentParser(description='Run performance benchmark for an LLM model') 19 | parser.add_argument('--name', type=str, default=None, help='name of this performance benchmark run') 20 | parser.add_argument('--config', type=str, default=None, help='path to config file that will be used for the performance benchmark') 21 | parser.add_argument('--loops', type=int, default=1, help='number of times the performance benchmark will be run (default=1)') 22 | 23 | def main(name=None, config_path=None): 24 | ID = str(uuid.uuid4()) 25 | 26 | logger.info(f"This performance run's ID is {ID} with name={name}", True) 27 | 28 | # Get the current script path 29 | current_script_path = os.path.dirname(os.path.abspath(__file__)) 30 | 31 | if config_path == None: 32 | raise Exception(f"please provide a path to a test config file (json)") 33 | elif os.path.isfile(str(config_path)) == False: 34 | logger.error(f"[{ID}] Config path {config_path} does not exist! 
Existing...", True) 35 | sys.exit(1) 36 | 37 | config = util.read_json(config_path) 38 | logger.info(f"[{ID}] Loaded config file {config_path} for this benchmark run, with the following configuration: {config}", True) 39 | 40 | # NOTE: make sure a python environment named "env" is created in the same repo as this script 41 | env_path = os.path.join(current_script_path, "env/bin/python3") 42 | if os.path.isfile(env_path) == False: 43 | logger.critical(f"[{ID}] python environment {env_path} does not exist, please create it!", True) 44 | sys.exit(1) 45 | 46 | # TODO: (11-24-2023) this is commented out because not every LLM framework uses HuggingFace or the same model name(s) 47 | # TODO: (11-24-2023) a solution for this needs to be found or this needs to get ripped out entirly 48 | # logger.info(f"[{ID}] checking if model exists and is downloaded locally...", True) 49 | # local_hf_exists = hf.get_hf_model(str(config["model"])) 50 | # if local_hf_exists == False: 51 | # logger.critical(f"[{ID}] failed to download model {config['model']}, please look into this, existing...", True) 52 | # sys.exit(1) 53 | 54 | ################################################################################################ 55 | 56 | logger.info(f"[{ID}] Starting metrics collector...", True) 57 | try: 58 | collecting_process = subprocess.Popen([env_path, os.path.join(current_script_path, "metrics.py"), 59 | '--time-delay', str(config["time_delay"]), 60 | "--uuid", str(ID) 61 | ]) 62 | logger.info(f"[{ID}] the metrics collector is running with a PID of {collecting_process.pid}", True) 63 | except Exception as err: 64 | logger.error(f"[{ID}] failed to run metric collector due to error: {err}, so existing...", True) 65 | sys.exit(1) 66 | 67 | ################################################################################################ 68 | 69 | logger.info(f"[{ID}] Initiated {config['model_start_pause']} second pre model start to gather hardware metrics BEFORE the model is activated", True) 70 | time.sleep(config["model_start_pause"]) 71 | 72 | ################################################################################################ 73 | 74 | logger.info(f"[{ID}] Activating model {config['model']} with following parameters: {str(config)}", True) 75 | try: 76 | model_running_process = subprocess.Popen([env_path, os.path.join(current_script_path, "model.py"), 77 | "--framework", str(config.get("framework")), 78 | "--max_length", str(config.get("max_length")), 79 | "--temperature", str(config.get("temperature")), 80 | "--top_k", str(config.get("top_k")), 81 | "--top_p", str(config.get("top_p")), 82 | "--num_return_sequences", str(config.get("num_return_sequences")), 83 | "--uuid", str(ID), 84 | "--prompt", str(config.get("prompt")), 85 | "--model", str(config.get("model")), 86 | "--device", str(config.get("device")), 87 | "--dtype", str(config.get("dtype")) 88 | ]) 89 | logger.info(f"[{ID}] model {config['model']} is running with a PID of {model_running_process.pid}", True) 90 | except Exception as err: 91 | logger.error(f"[{ID}] failed to run model {config['model']} due to error: {err}", True) 92 | logger.error(f"[{ID}] attempting to kill metrics collector due to model failing to run", True) 93 | collecting_process.send_signal(signal.SIGTERM) 94 | collecting_process.wait() 95 | sys.exit(1) 96 | 97 | ################################################################################################ 98 | 99 | logger.info(f"[{ID}] waiting for model {config['model']} to finish running...", True) 100 | 
model_running_process.wait() 101 | logger.info(f"[{ID}] model {config['model']} finished running! no longer waiting!", True) 102 | 103 | logger.info(f"[{ID}] Initiated {config['model_end_pause']} second post-model-end pause to gather hardware metrics AFTER the model has completed its run", True) 104 | time.sleep(config["model_end_pause"]) 105 | 106 | logger.info(f"[{ID}] Kill signal is being sent to the metrics collector, it should finish running soon...", True) 107 | collecting_process.send_signal(signal.SIGTERM) 108 | collecting_process.wait() 109 | 110 | exported_files_paths = util.get_id_files(ID, current_script_path) 111 | if len(exported_files_paths) != 2: 112 | logger.critical(f"[{ID}] The metrics-collector and model have completed their runs BUT there are only {len(exported_files_paths)} exported data files NOT 2, look into this, exiting...", True) 113 | sys.exit(1) 114 | 115 | # get full file paths for metrics data file & model data file 116 | metrics_data = None 117 | model_data = None 118 | for file in exported_files_paths: 119 | if "_metrics.json" in file: 120 | metrics_data = file 121 | elif "_model.json" in file: 122 | model_data = file 123 | else: 124 | logger.critical(f"[{ID}] Of the expected data output files, this file has an unexpected file 'extension': {file}", True) 125 | sys.exit(1) 126 | 127 | # create reports/ directory if it does not exist 128 | reports_path = os.path.join(current_script_path, "reports") 129 | if not os.path.exists(reports_path): 130 | os.makedirs(reports_path) 131 | 132 | # build filepath for final report file 133 | final_data_path = f'report_{datetime.now(timezone.utc).strftime("%Y-%m-%d_%H-%M-%S.%f_utc")}_{ID}.json' 134 | if name != None: 135 | final_data_path = f"{name}_{final_data_path}" 136 | final_data_path = os.path.join(reports_path, final_data_path) 137 | 138 | final_dataset = { 139 | "model": util.read_json(model_data), 140 | "test_env": { 141 | "params": config, 142 | "commit": util.get_current_commit(), 143 | "hardware": hw.get_all(static_only=True) 144 | }, 145 | "metric": util.read_json(metrics_data) 146 | } 147 | 148 | # export file data/results 149 | util.write_json(final_data_path, final_dataset) 150 | 151 | # delete exported data files from metrics-collector and model-runner 152 | # NOTE: we have to be careful here 153 | util.delete_file(model_data) 154 | util.delete_file(metrics_data) 155 | 156 | logger.warning(f"[{ID}] Deleted exported sub-data files: {model_data} & {metrics_data}", True) 157 | logger.info(f"[{ID}] ==> Created final report from this performance benchmark to file: {final_data_path}", True) 158 | 159 | # TODO: returning the final output data's filepath for now 160 | return final_data_path 161 | 162 | if __name__ == "__main__": 163 | args = parser.parse_args() 164 | 165 | loops = int(args.loops) 166 | if loops < 1: 167 | raise Exception(f"loops MUST be greater than or equal to 1!") 168 | 169 | # single benchmark run 170 | if loops <= 1: 171 | start_time = time.time() 172 | main(name=args.name, config_path=args.config) 173 | runtime = time.time() - start_time 174 | logger.info(f"(single) Total Runtime: {runtime} seconds", True) 175 | sys.exit(0) 176 | 177 | # multiple benchmark runs 178 | start_time = time.time() 179 | all_filepaths = [] 180 | for i in range(int(args.loops)): 181 | i_name = f"run_{i}" 182 | if args.name != None: 183 | i_name = f"{args.name}_{i_name}" 184 | logger.info(f"Run {i+1}/{args.loops} for performance benchmark", True) 185 | filepath = main(name=i_name, config_path=args.config) 186 | 
all_filepaths.append(filepath) 187 | logger.info(f"==> Muli-Run completed for performance benchmark. A total of {args.loops} runs we done and the following data was exported: {all_filepaths}", True) 188 | runtime = time.time() - start_time 189 | logger.info(f"(multiple) Total Runtime: {runtime} seconds", True) 190 | 191 | sys.exit(0) 192 | -------------------------------------------------------------------------------- /src/hf.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | from huggingface_hub import snapshot_download 3 | from huggingface_hub import HfApi 4 | import torch 5 | import time 6 | import os 7 | 8 | import logger 9 | 10 | def count_tokens(model_name, text): 11 | tokenizer = AutoTokenizer.from_pretrained(model_name) 12 | encoded_input = tokenizer(text) 13 | num_tokens = len(encoded_input['input_ids']) 14 | return num_tokens 15 | 16 | def validate_options(options, valid_keys): 17 | user_keys = set(options.keys()) 18 | return user_keys.issubset(valid_keys) 19 | 20 | def str_to_torch_dtype(dtype_str): 21 | dtype_map = { 22 | 'float32': torch.float32, 23 | 'float': torch.float, 24 | 'float64': torch.float64, 25 | 'double': torch.double, 26 | 'float16': torch.float16, 27 | 'half': torch.half, 28 | 'bfloat16': torch.bfloat16, 29 | 'int8': torch.int8, 30 | 'uint8': torch.uint8, 31 | 'int16': torch.int16, 32 | 'short': torch.short, 33 | 'int32': torch.int32, 34 | 'int': torch.int, 35 | 'int64': torch.int64, 36 | 'long': torch.long, 37 | 'bool': torch.bool, 38 | 'complex64': torch.complex64, 39 | 'complex128': torch.complex128, 40 | 'cdouble': torch.cdouble, 41 | 'quint8': torch.quint8, 42 | 'qint8': torch.qint8, 43 | 'qint32': torch.qint32, 44 | 'quint4x2': torch.quint4x2 45 | } 46 | return dtype_map.get(dtype_str, None) 47 | 48 | def run_llm(model_name, input, device="", dtype="bfloat16", model_params={}): 49 | valid_model_params = {"max_length", "temperature", "top_k", "top_p", "num_return_sequences"} 50 | if model_params != {} and validate_options(model_params, valid_model_params) == False: 51 | raise Exception(f"model_params only accepts the following keys: {model_params.keys()}") 52 | 53 | dtype_torch = str_to_torch_dtype(dtype) 54 | if dtype_torch == None: 55 | raise Exception(f"{dtype} is NOT a valid dtype supported by Pytorch") 56 | 57 | # TODO: currently this function only supports one GPU, the goal will be to update this to support muliple GPU(s) 58 | if "cuda:" not in device and device != "cpu" and device != "": 59 | raise Exception(f"device can only be type cuda:N, cpu, or auto") 60 | 61 | """ 62 | November 21, 2023 63 | Default model_params for generate() function 64 | https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig 65 | """ 66 | default_model_params = { 67 | "max_length": 20, 68 | "temperature": 1.0, 69 | "top_k": 50, 70 | "top_p": 1.0, 71 | "num_return_sequences": 1 72 | } 73 | 74 | if device == "": 75 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 76 | 77 | tokenizer = AutoTokenizer.from_pretrained(model_name) 78 | model = AutoModelForCausalLM.from_pretrained(model_name) 79 | model = model.to(device).to(dtype_torch) 80 | 81 | model_dtype = next(model.parameters()).dtype 82 | 83 | inputs = tokenizer(str(input), return_tensors="pt").to(device) 84 | 85 | for key in model_params: 86 | if model_params[key] != None: 87 | default_model_params[key] = model_params[key] 88 | 89 | start_time = time.time() 90 | 91 | 
generated_sequences = model.generate( 92 | inputs["input_ids"], 93 | max_length=default_model_params["max_length"], 94 | temperature=default_model_params["temperature"], 95 | top_k=default_model_params["top_k"], 96 | top_p=default_model_params["top_p"], 97 | num_return_sequences=default_model_params["num_return_sequences"] 98 | ) 99 | 100 | runtime = time.time() - start_time 101 | 102 | response = tokenizer.decode(generated_sequences[0]) 103 | device = generated_sequences.device 104 | 105 | tokens_in = inputs["input_ids"].size(1) * default_model_params["num_return_sequences"] 106 | tokens_out = generated_sequences.size(1) * default_model_params["num_return_sequences"] 107 | 108 | return { 109 | "model_name": model_name, 110 | "runtime_secs": runtime, 111 | "prompt": input, 112 | "response": response, 113 | "tokens": { 114 | "input": tokens_in, 115 | "output": tokens_out 116 | }, 117 | "tokens_out/sec": tokens_out / runtime, 118 | "device": str(generated_sequences.device), 119 | "model_params": { 120 | "dtype": str(model_dtype), 121 | "max_length": default_model_params["max_length"], 122 | "temperature": default_model_params["temperature"], 123 | "top_k": default_model_params["top_k"], 124 | "top_p": default_model_params["top_p"], 125 | "num_return_sequences": default_model_params["num_return_sequences"] 126 | } 127 | } 128 | 129 | # Check if hf model exists and download hf model to local disk if needed 130 | def get_hf_model(hf_repo_path): 131 | # quick & dirty way to check if a hf model/repo exists 132 | api = HfApi() 133 | try: 134 | api.list_repo_files(hf_repo_path) 135 | except Exception as err: 136 | logger.error(f"failed to check if model {hf_repo_path} exists on HuggingFace due to error: {err}") 137 | return False 138 | 139 | # quick & dirty way to check if hf model/repo has been downloaded, saved to cache directory 140 | cache_dir = os.path.join(os.path.expanduser('~'), ".cache/huggingface/hub/") 141 | model_cache_dir = os.path.join(cache_dir, "models--{}".format(hf_repo_path.replace("/", "--"))) 142 | if os.path.isdir(model_cache_dir): 143 | logger.info(f"model {hf_repo_path} already exists in directory {model_cache_dir}") 144 | return True 145 | 146 | # download hf model/repo if it's not downloaded 147 | try: 148 | snapshot_download(repo_id=hf_repo_path, repo_type="model", token=True) 149 | logger.info(f"downloaded model {hf_repo_path}") 150 | return True 151 | except Exception as err: 152 | logger.error(f"{err}") 153 | return False 154 | -------------------------------------------------------------------------------- /src/hw.py: -------------------------------------------------------------------------------- 1 | """ 2 | Function for getting hardware information/metrics in python 3 | 4 | Massive credit goes to Abdeladim Fadheli's amazing article "How to Get Hardware and System Information in Python" 5 | The following functions where based off of Fadheli's article: 6 | get_size() 7 | system_info() 8 | cpu_info() 9 | memory_info() 10 | disk_info() 11 | gpu_info() 12 | 13 | https://thepythoncode.com/article/get-hardware-system-information-python 14 | 15 | November 20, 2023 16 | """ 17 | 18 | import subprocess 19 | import psutil 20 | import platform 21 | import GPUtil 22 | import json 23 | 24 | def execute(cmd): 25 | proc = subprocess.Popen(str(cmd), shell=True, stdout=subprocess.PIPE,) 26 | output = proc.communicate()[0].decode("utf-8") 27 | return output.split("\n") 28 | 29 | def neofetch(): 30 | # requires neofetch (https://github.com/dylanaraps/neofetch) 31 | cmd = "neofetch 
--stdout" 32 | output = {} 33 | try: 34 | stdout = execute(cmd) 35 | for line in stdout: 36 | if ": " not in line: 37 | continue 38 | tmp = line.split(": ", 1) 39 | if len(tmp) != 2: 40 | continue 41 | key = tmp[0].lower().strip().replace(" ", "") 42 | value = tmp[1].strip() 43 | output[key] = value 44 | except: 45 | pass 46 | return output 47 | 48 | def get_size(bytes, suffix="B"): 49 | """ 50 | Scale bytes to its proper format 51 | e.g: 52 | 1253656 => '1.20MB' 53 | 1253656678 => '1.17GB' 54 | """ 55 | factor = 1024 56 | for unit in ["", "K", "M", "G", "T", "P"]: 57 | if bytes < factor: 58 | return f"{bytes:.2f}{unit}{suffix}" 59 | bytes /= factor 60 | 61 | def system_info(native=True): 62 | if native == False: 63 | return neofetch() 64 | 65 | uname = platform.uname() 66 | return { 67 | "system": uname.system, 68 | "node_name": uname.node, 69 | "release": uname.release, 70 | "version": uname.version, 71 | "machine": uname.machine, 72 | "processor": uname.processor 73 | } 74 | 75 | def cpu_info(options={}): 76 | # model_name = options.get("model") 77 | # if model_name == None: 78 | # model_name = neofetch()["cpu"] 79 | 80 | cores = None 81 | if options.get("cores") == True: 82 | cores = {} 83 | for i, percentage in enumerate(psutil.cpu_percent(percpu=True, interval=1)): 84 | cores[str(i)] = f"{percentage}%" 85 | 86 | cpufreq = psutil.cpu_freq() 87 | 88 | output = { 89 | # "model": str(model_name), 90 | "physical_cores": psutil.cpu_count(logical=False), 91 | "total_cores": psutil.cpu_count(logical=True), 92 | "max_frequency": f"{cpufreq.max:.2f}Mhz", 93 | "min_frequency": f"{cpufreq.min:.2f}Mhz", 94 | "current_frequency": f"{cpufreq.current:.2f}Mhz" 95 | } 96 | 97 | if cores != None: 98 | output["cores"] = cores 99 | 100 | return output 101 | 102 | def memory_info(): 103 | svmem = psutil.virtual_memory() 104 | swap = psutil.swap_memory() 105 | return { 106 | "total": get_size(svmem.total), 107 | "available": get_size(svmem.available), 108 | "used": get_size(svmem.used), 109 | "percentage": svmem.percent, 110 | "swap": { 111 | "total": get_size(swap.total), 112 | "free": get_size(swap.free), 113 | "used": get_size(swap.used), 114 | "usage": swap.percent 115 | } 116 | } 117 | 118 | def disk_info(): 119 | output = {} 120 | partitions = psutil.disk_partitions() 121 | for partition in partitions: 122 | try: 123 | partition_usage = psutil.disk_usage(partition.mountpoint) 124 | except PermissionError: 125 | partition_usage = "" 126 | 127 | output["partition.device"] = { 128 | "mount_point": partition.mountpoint, 129 | "file_system_type": partition.fstype 130 | } 131 | 132 | if partition_usage != "": 133 | output["partition.device"] = {**output["partition.device"], **{ 134 | "total_size": get_size(partition_usage.total), 135 | "used": get_size(partition_usage.used), 136 | "free": get_size(partition_usage.free), 137 | "usage": f"{partition_usage.percent}%" 138 | }} 139 | 140 | disk_io = psutil.disk_io_counters() 141 | return { 142 | "total_read": get_size(disk_io.read_bytes), 143 | "total_write": get_size(disk_io.write_bytes), 144 | "partitions": output 145 | } 146 | 147 | def gpu_info(raw=False): 148 | gpus = GPUtil.getGPUs() 149 | gpus_data = {} 150 | for gpu in gpus: 151 | if raw: 152 | gpus_data[gpu.id] = vars(gpu) 153 | else: 154 | gpus_data[gpu.id] = { 155 | "id": gpu.id, 156 | "uuid": gpu.uuid, 157 | "name": gpu.name, 158 | "driver": gpu.driver, 159 | "load": f"{gpu.load*100}%", 160 | "memory": { 161 | "free": f"{gpu.memoryFree}MB", 162 | "used": f"{gpu.memoryUsed}MB", 163 | "total": 
f"{gpu.memoryTotal}MB" 164 | }, 165 | "temp": f"{gpu.temperature} °C" 166 | } 167 | return gpus_data 168 | 169 | def get_all(static_only=False): 170 | if static_only: 171 | return { 172 | "system": system_info(), 173 | "neofetch": system_info(False) 174 | } 175 | 176 | return { 177 | "cpu": cpu_info({"cores": True}), 178 | "ram": memory_info(), 179 | "disk": disk_info(), 180 | "gpu": gpu_info() 181 | } 182 | -------------------------------------------------------------------------------- /src/llmvm.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from llm_vm.client import Client 3 | import torch 4 | import time 5 | import sys 6 | import os 7 | import subprocess 8 | 9 | import logger 10 | 11 | ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)) 12 | sys.path.extend([ROOT_DIR, os.path.join(ROOT_DIR, "src")]) 13 | import util 14 | 15 | """ 16 | ABOUT: 17 | This function contains the official logic used by LLM-VM to pick a device (GPUs or CPUs) 18 | NOTES: 19 | Update this function as need be, LLM-VM is constantly changing 20 | LAST-DATE: 21 | November 15, 2023 22 | SOURCE: 23 | https://github.com/anarchy-ai/LLM-VM/blob/main/src/llm_vm/onsite_llm.py 24 | ~lines 45 - 49 25 | """ 26 | def llmvm_device_picker(): 27 | device = None 28 | if torch.cuda.device_count() > 1: 29 | device = [f"cuda:{i}" for i in range(torch.cuda.device_count())] # List of available GPUs 30 | else: # If only one GPU is available, use cuda:0, else use CPU 31 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 32 | return device 33 | 34 | def count_tokens(model_name, text): 35 | tokenizer = AutoTokenizer.from_pretrained(model_name) 36 | encoded_input = tokenizer(text) 37 | num_tokens = len(encoded_input['input_ids']) 38 | return num_tokens 39 | 40 | def run_llm(model_name, prompt, supported_models, model_params={}): 41 | if model_name not in supported_models: 42 | raise Exception("model {} is NOT supported in LLM-VM".format(model_name)) 43 | if type(supported_models[model_name]) != str: 44 | raise Exception("model {} is a close-sourced, API based, model".format(model_name)) 45 | if type(prompt) != str or len(prompt) == 0: 46 | raise Exception("prompt MOST be type str and have a length greater then 0") 47 | 48 | """ 49 | ABOUT: 50 | This is the default value for temperature in LLM-VM at the moment 51 | LAST-DATE: 52 | November 24, 2023 53 | SOURCE: 54 | https://github.com/anarchy-ai/LLM-VM/blob/main/src/llm_vm/client.py 55 | ~lines 109 56 | """ 57 | temp = model_params.get("temperature") 58 | if temp == None: 59 | temp = 0 60 | # TODO: remove this log when a solution is implmented 61 | logger.warning(f"currently, {run_llm.__name__}() only supports LLM-VM(s): temperature", True) 62 | 63 | client = Client(big_model=str(model_name)) 64 | 65 | start_time = time.time() 66 | 67 | # NOTE: we only accept temperatuer for now 68 | response=client.complete(prompt=prompt, temperature=temp) 69 | 70 | runtime = time.time() - start_time 71 | 72 | device = llmvm_device_picker() 73 | huggingface_path = supported_models[model_name] 74 | tokens_in = count_tokens(huggingface_path, prompt) 75 | tokens_out = count_tokens(huggingface_path, response["completion"]) 76 | 77 | return { 78 | "model_name": model_name, 79 | "model_path": huggingface_path, 80 | "runtime_secs": runtime, 81 | "prompt": prompt, 82 | "response": response, 83 | "tokens": { 84 | "input": tokens_in, 85 | "output": tokens_out 86 | }, 87 | 
"tokens_out/sec": tokens_out / runtime, 88 | "device": str(device), 89 | "model_params": { 90 | "temperature": temp 91 | } 92 | } 93 | 94 | -------------------------------------------------------------------------------- /src/logger.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | import logging 3 | import time 4 | import os 5 | 6 | colors = { 7 | 'black': '\033[30m', 8 | 'red': '\033[31m', 9 | 'green': '\033[32m', 10 | 'yellow': '\033[33m', 11 | 'blue': '\033[34m', 12 | 'magenta': '\033[35m', 13 | 'cyan': '\033[36m', 14 | 'white': '\033[37m', 15 | 'bright_black': '\033[90m', 16 | 'bright_red': '\033[91m', 17 | 'bright_green': '\033[92m', 18 | 'bright_yellow': '\033[93m', 19 | 'bright_blue': '\033[94m', 20 | 'bright_magenta': '\033[95m', 21 | 'bright_cyan': '\033[96m', 22 | 'bright_white': '\033[97m', 23 | 'reset': '\033[0m', 24 | } 25 | 26 | # determine and set the path for the log file 27 | filepath = os.path.dirname(os.path.abspath(__file__)) 28 | filepath = "/".join(filepath.split("/")[:-1]) 29 | filepath = os.path.join(filepath, "events.log") 30 | 31 | logging.basicConfig(level=logging.DEBUG, 32 | format='%(asctime)s %(filename)s:%(funcName)s:%(lineno)d [%(levelname)s] %(message)s', 33 | datefmt='%m/%d/%Y %H:%M:%S %z', 34 | filename=filepath, 35 | filemode='a') 36 | 37 | logging.Formatter.converter = time.gmtime 38 | 39 | def logger_printer(msg, sev=None): 40 | sev = str(sev).lower() 41 | 42 | timestamp = datetime.now(timezone.utc).strftime("%m-%d-%Y %H:%M:%S.%f UTC") 43 | timestamp = f"{timestamp} ({datetime.utcnow().timestamp()})" 44 | 45 | sev_val = "" 46 | color_val = "" 47 | end_val = colors.get("reset", "") 48 | if sev == "info" or sev == "1": 49 | color_val = colors.get("green", "") 50 | sev_val = "INFO" 51 | elif sev == "warning" or sev == "2": 52 | color_val = colors.get("yellow", "") 53 | sev_val = "WARNING" 54 | elif sev == "error" or sev == "3": 55 | color_val = colors.get("magenta", "") 56 | sev_val = "ERROR" 57 | elif sev == "critical" or sev == "4": 58 | color_val = colors.get("red", "") 59 | sev_val = "CRITICAL" 60 | else: 61 | color_val = None 62 | sev_val = "NOTSET" 63 | 64 | if color_val == None: 65 | color_val = "" 66 | end_val = "" 67 | 68 | print(f"{color_val}{timestamp} [{sev_val}] - {msg}{end_val}" ) 69 | 70 | def info(msg, print_it=False): 71 | logging.info(msg) 72 | if print_it: 73 | logger_printer(msg, 1) 74 | 75 | def warning(msg, print_it=False): 76 | logging.warning(msg) 77 | if print_it: 78 | logger_printer(msg, 2) 79 | 80 | def error(msg, print_it=False): 81 | logging.error(msg) 82 | if print_it: 83 | logger_printer(msg, 3) 84 | 85 | def critical(msg, print_it=False): 86 | logging.critical(msg) 87 | if print_it: 88 | logger_printer(msg, 4) 89 | 90 | -------------------------------------------------------------------------------- /src/pointer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import util 4 | 5 | ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)) 6 | 7 | SUPPORTED_FRAMEWORKS = [ 8 | "huggingface", 9 | "llm-vm" 10 | ] 11 | 12 | # NOTE: are of 11-24-2023 these are the current models supported in LLM-VM 13 | llmvm_supported_models = { 14 | "pythia": "EleutherAI/pythia-70m-deduped", 15 | "opt": "facebook/opt-350m", 16 | "bloom": "bigscience/bloom-560m", 17 | "neo": "EleutherAI/gpt-neo-1.3B", 18 | "smallorca": "Open-Orca/LlongOrca-7B-16k", 19 | 
"orca": "Open-Orca/LlongOrca-13B-16k", 20 | "mistral": "Open-Orca/Mistral-7B-OpenOrca", 21 | "platypus": "Open-Orca/OpenOrca-Platypus2-13B", 22 | "llama": "openlm-research/open_llama_3b_v2", 23 | "llama2": "meta-llama/Llama-2-7b-hf", 24 | "codellama-7b": "codellama/CodeLlama-7b-hf", 25 | "codellama-13b": "codellama/CodeLlama-13b-hf", 26 | "codellama-34b": "codellama/CodeLlama-34b-hf", 27 | "flan": "google/flan-t5-small", 28 | "bert": None, 29 | "gpt": None, 30 | "gpt4": None, 31 | "chat_gpt": None, 32 | "quantized-llama2-7b-base": "TheBloke/Llama-2-7B-GGML", 33 | "quantized-llama2-13b-base": "TheBloke/Llama-2-13B-GGML", 34 | "llama2-7b-chat-Q4": "TheBloke/Llama-2-7B-Chat-GGML", 35 | "llama2-7b-chat-Q6": "TheBloke/Llama-2-7B-Chat-GGML", 36 | "llama2-13b-chat-Q4": "TheBloke/Llama-2-13B-Chat-GGML", 37 | "llama2-13b-chat-Q6": "TheBloke/Llama-2-13B-Chat-GGML", 38 | "llama2-7b-32k-Q4": "TheBloke/Llama-2-7B-32K-Instruct-GGML" 39 | } 40 | 41 | def execute_llm(framework, model_name=None, prompt=None, device=None, dtype=None, model_params={}): 42 | if framework not in SUPPORTED_FRAMEWORKS: 43 | raise Exception(f"framework {framework} is not supported!") 44 | 45 | if framework == "huggingface": 46 | import hf 47 | return hf.run_llm(model_name=model_name, input=prompt, device=device, dtype=dtype, model_params=model_params) 48 | 49 | if framework == "llm-vm": 50 | import llmvm 51 | return llmvm.run_llm(model_name=model_name, prompt=prompt, model_params={}, supported_models=llmvm_supported_models) 52 | 53 | raise Exception(f"the logic is off in {execute_llm.__name__}()") 54 | -------------------------------------------------------------------------------- /src/util.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import json 3 | import os 4 | 5 | import logger 6 | 7 | def execute(cmd): 8 | proc = subprocess.Popen(str(cmd), shell=True, stdout=subprocess.PIPE,) 9 | output = proc.communicate()[0].decode("utf-8") 10 | return output.split("\n") 11 | 12 | def read_file(path): 13 | with open(str(path)) as file: 14 | content = file.readlines() 15 | content = [i.strip() for i in content] 16 | return content 17 | 18 | def write_file(path, data): 19 | file = open(str(path), "w") 20 | for line in data: 21 | file.write(str(line) + "\n") 22 | file.close() 23 | 24 | def read_json(path): 25 | with open(str(path)) as file: 26 | content = json.load(file) 27 | return content 28 | 29 | def write_json(path, data): 30 | with open(str(path), "w") as file: 31 | # NOTE: ensure_ascii will write unicode characters as they are 32 | json.dump(data, file, indent=4, ensure_ascii=False) 33 | 34 | def create_file(path): 35 | file = open(str(path), "a+") 36 | file.close() 37 | 38 | def get_id_files(id, dir_path): 39 | if os.path.exists(dir_path) == False or os.path.isdir(dir_path) == False: 40 | raise Exception(f"dir path {dir_path} does not exist!") 41 | files = [] 42 | for f in os.listdir(dir_path): 43 | full_path = os.path.join(dir_path, f) 44 | if os.path.isfile(full_path) and (str(id) in f) and (".json" in full_path): 45 | files.append(full_path) 46 | return files 47 | 48 | def delete_file(file_path): 49 | if os.path.isfile(file_path): 50 | try: 51 | os.remove(file_path) 52 | logger.info(f"deleted file {file_path}") 53 | except Exception as err: 54 | logger.error(f"error! 
failed to delete file {file_path}") 55 | else: 56 | logger.info(f"file {file_path} does not exist") 57 | 58 | def get_current_commit(): 59 | try: 60 | cmd = "git log | head -n 1 | awk '{print $2}'" 61 | output = execute(cmd) 62 | return output[0] 63 | except: 64 | return "" 65 | -------------------------------------------------------------------------------- /tools/columns.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import fnmatch 4 | import csv 5 | import time 6 | 7 | def read_json(path): 8 | with open(str(path)) as file: 9 | content = json.load(file) 10 | return content 11 | 12 | def find_json_files(directory, deep_search=False): 13 | if deep_search == False: 14 | matches = [] 15 | for file in os.listdir(directory): 16 | if file.endswith('.json'): 17 | matches.append(os.path.join(directory, file)) 18 | return matches 19 | 20 | matches = [] 21 | for root, dirnames, filenames in os.walk(directory): 22 | for filename in fnmatch.filter(filenames, '*.json'): 23 | matches.append(os.path.join(root, filename)) 24 | return matches 25 | 26 | # https://stackoverflow.com/a/34311071 27 | def save_csv(filepath, data): 28 | filepath = str(filepath) 29 | if filepath.endswith(".csv") == False: 30 | raise Exception(f"filepath MOST end with .csv") 31 | with open(str(filepath), "w") as f: 32 | wr = csv.DictWriter(f, delimiter="\t",fieldnames=list(data[0].keys())) 33 | wr.writeheader() 34 | wr.writerows(data) 35 | 36 | # main function calls 37 | 38 | reports_path = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), "reports") 39 | 40 | json_files_paths = find_json_files(reports_path) 41 | 42 | output = [] 43 | for path in json_files_paths: 44 | print(f"Processing file: {path}") 45 | 46 | data = read_json(path) 47 | 48 | max_gpu_memory_usage = -1 49 | min_gpu_memory_usage = 1000000000000000000000000 50 | for key in list(data["metric"].keys()): 51 | # TODO: this currently only assumes the RAM usage is in MB, this needs to be fixed 52 | memory_usage = float(data["metric"][key]['gpu']['0']['memory']['used'].replace('MB', '')) / 1024 53 | if memory_usage > max_gpu_memory_usage: 54 | max_gpu_memory_usage = memory_usage 55 | if memory_usage < min_gpu_memory_usage: 56 | min_gpu_memory_usage = memory_usage 57 | 58 | tmp = { 59 | **data["test_env"]["params"], 60 | "runtime_sec": data["model"]["runtime_secs"], 61 | "tokens/sec": data["model"]["tokens_out/sec"], 62 | "tokens_in": data["model"]["tokens"]["input"], 63 | "tokens_out": data["model"]["tokens"]["output"], 64 | "gpu": data["metric"][list(data["metric"].keys())[0]]["gpu"]["0"]["name"], 65 | "max_gpu_memory_usage": max_gpu_memory_usage, 66 | "min_gpu_memory_usage": min_gpu_memory_usage, 67 | "file": path.split("/")[-1] 68 | } 69 | 70 | del tmp["prompt"] 71 | del tmp["model_start_pause"] 72 | del tmp["model_end_pause"] 73 | 74 | output.append(tmp) 75 | 76 | csv_filename = f"results_{time.time()}.csv" 77 | save_csv(csv_filename, output) 78 | print(f"Created CSV file: {csv_filename}") 79 | 80 | -------------------------------------------------------------------------------- /tools/graph.py: -------------------------------------------------------------------------------- 1 | # NOTE: (11-24-2023) most of this code was generated by ChatGPT (GPT-4). 
It is a quick solution and needs refinement 2 | 3 | import matplotlib.pyplot as plt 4 | import matplotlib.dates as mdates 5 | from datetime import datetime 6 | import json 7 | import os 8 | import glob 9 | 10 | def pick_json_file(parent_directory): 11 | # list all files in the parent directory that have a .json extension. 12 | json_files = glob.glob(os.path.join(parent_directory, 'reports/*.json')) 13 | json_files = [string for string in json_files if "report" in string] 14 | 15 | # display the JSON files. 16 | print("Please select a JSON file to load:") 17 | for idx, file_name in enumerate(json_files): 18 | print(f"{idx}: {file_name}") 19 | 20 | # prompt the user to select a file by number. 21 | selected_index = int(input("Enter the number of the JSON file you want to read: ")) 22 | selected_file_path = json_files[selected_index] 23 | 24 | # read the selected JSON file. 25 | with open(selected_file_path) as file: 26 | content = json.load(file) 27 | 28 | return content 29 | 30 | # load data 31 | parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) 32 | data = pick_json_file(parent_directory) 33 | 34 | # TODO: (11-24-2023) for now, we will focus on GPU metrics 35 | data = data["metric"] 36 | 37 | timestamps = [] 38 | memory_usages = [] 39 | for epoch, info in data.items(): 40 | # convert epoch to datetime for plotting 41 | timestamp = datetime.fromtimestamp(float(epoch)) 42 | # get GPU memory usage and convert it to GB 43 | memory_usage = float(info['gpu']['0']['memory']['used'].replace('MB', '')) / 1024 44 | 45 | timestamps.append(timestamp) 46 | memory_usages.append(memory_usage) 47 | 48 | # plotting 49 | plt.figure(figsize=(10, 5)) 50 | plt.plot(timestamps, memory_usages, marker='o') 51 | plt.xlabel('Epoch Time (seconds)') 52 | plt.ylabel('GPU Memory Usage (GB)') 53 | plt.title('GPU Memory Usage Over Time') 54 | plt.grid(True) 55 | plt.tight_layout() 56 | 57 | plt.show() 58 | --------------------------------------------------------------------------------