├── docs ├── issues.md ├── news.md ├── performance.md └── pypi.md ├── tests ├── __init__.py └── test_get_prompt.py ├── llama2_wrapper ├── server │ ├── __init__.py │ ├── __main__.py │ └── app.py ├── download │ ├── __init__.py │ └── __main__.py ├── __init__.py ├── types.py └── model.py ├── static └── screenshot.png ├── .gitignore ├── requirements.txt ├── .env ├── env_examples ├── .env.13b_example ├── .env.7b_8bit_example ├── .env.7b_ggmlv3_q4_0_example └── .env.7b_gptq_example ├── .github └── workflows │ ├── release.yml │ └── branch.yml ├── LICENSE ├── pyproject.toml ├── prompts └── utils.py ├── CONTRIBUTING.md ├── colab ├── ggmlv3_q4_0.ipynb └── webui_CodeLlama_7B_Instruct_GPTQ.ipynb ├── benchmark.py ├── code_completion.py ├── app.py └── README.md /docs/issues.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama2_wrapper/server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama2_wrapper/download/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama2_wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LLAMA2_WRAPPER, get_prompt, get_prompt_for_dialog 2 | -------------------------------------------------------------------------------- /static/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liltom-eth/llama2-webui/HEAD/static/screenshot.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | models 2 | dist 3 | 4 | .DS_Store 5 | .vscode 6 | 7 | __pycache__ 8 | gradio_cached_examples 9 | 10 | .pytest_cache -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | auto-gptq==0.3.0 3 | bitsandbytes==0.40.2 4 | gradio==3.37.0 5 | protobuf==3.20.3 6 | scipy==1.11.1 7 | sentencepiece==0.1.99 8 | torch==2.0.1 9 | transformers==4.31.0 10 | tqdm==4.65.0 11 | python-dotenv==1.0.0 12 | llama-cpp-python==0.2.11 13 | memory-profiler==0.61.0 14 | huggingface-hub==0.16.4 15 | fastapi==0.100.0 16 | uvicorn==0.23.1 17 | sse-starlette==1.6.5 18 | pydantic==2.2.1 19 | pydantic-settings==2.0.3 20 | pytest==7.4.0 21 | black==23.7.0 22 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "" 2 | # if MODEL_PATH is "", default llama.cpp/gptq models 3 | # will be downloaded to: ./models 4 | 5 | # Example ggml path: 6 | # MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin" 7 | 8 | # options: llama.cpp, gptq, transformers 9 | BACKEND_TYPE = "llama.cpp" 10 | 11 | # only for transformers bitsandbytes 8 bit 12 | LOAD_IN_8BIT = False 13 | 14 | MAX_MAX_NEW_TOKENS = 2048 15 | DEFAULT_MAX_NEW_TOKENS 
= 1024 16 | MAX_INPUT_TOKEN_LENGTH = 4000 17 | 18 | DEFAULT_SYSTEM_PROMPT = "" 19 | -------------------------------------------------------------------------------- /env_examples/.env.13b_example: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "./models/Llama-2-13b-chat-hf" 2 | 3 | # options: llama.cpp, gptq, transformers 4 | BACKEND_TYPE = "transformers" 5 | 6 | # only for transformers bitsandbytes 8 bit 7 | LOAD_IN_8BIT = True 8 | 9 | MAX_MAX_NEW_TOKENS = 2048 10 | DEFAULT_MAX_NEW_TOKENS = 1024 11 | MAX_INPUT_TOKEN_LENGTH = 4000 12 | 13 | DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." 14 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | release: 4 | types: 5 | - created 6 | 7 | jobs: 8 | publish: 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: ['3.10'] 13 | poetry-version: ['1.5.1'] 14 | os: [ubuntu-latest] 15 | runs-on: ${{ matrix.os }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-python@v3 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Run image 22 | uses: abatilo/actions-poetry@v2.1.4 23 | with: 24 | poetry-version: ${{ matrix.poetry-version }} 25 | - name: Publish 26 | env: 27 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 28 | run: | 29 | poetry config pypi-token.pypi $PYPI_TOKEN 30 | poetry publish --build 31 | -------------------------------------------------------------------------------- /env_examples/.env.7b_8bit_example: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "./models/Llama-2-7b-chat-hf" 2 | 3 | # options: llama.cpp, gptq, transformers 4 | BACKEND_TYPE = "transformers" 5 | 6 | # only for transformers bitsandbytes 8 bit 7 | LOAD_IN_8BIT = True 8 | 9 | MAX_MAX_NEW_TOKENS = 2048 10 | DEFAULT_MAX_NEW_TOKENS = 1024 11 | MAX_INPUT_TOKEN_LENGTH = 4000 12 | 13 | DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." 
14 | -------------------------------------------------------------------------------- /env_examples/.env.7b_ggmlv3_q4_0_example: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "" 2 | # if MODEL_PATH is "", default llama.cpp/gptq models 3 | # will be downloaded to: ./models 4 | 5 | # Example ggml path: 6 | # MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin" 7 | 8 | # options: llama.cpp, gptq, transformers 9 | BACKEND_TYPE = "llama.cpp" 10 | 11 | # only for transformers bitsandbytes 8 bit 12 | LOAD_IN_8BIT = False 13 | 14 | MAX_MAX_NEW_TOKENS = 2048 15 | DEFAULT_MAX_NEW_TOKENS = 1024 16 | MAX_INPUT_TOKEN_LENGTH = 4000 17 | 18 | DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." 19 | -------------------------------------------------------------------------------- /env_examples/.env.7b_gptq_example: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ" 2 | # if MODEL_PATH is "", default llama.cpp/gptq models 3 | # will be downloaded to: ./models 4 | 5 | # Example gptq path: 6 | # MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ" 7 | 8 | # options: llama.cpp, gptq, transformers 9 | BACKEND_TYPE = "gptq" 10 | 11 | # only for transformers bitsandbytes 8 bit 12 | LOAD_IN_8BIT = False 13 | 14 | MAX_MAX_NEW_TOKENS = 2048 15 | DEFAULT_MAX_NEW_TOKENS = 1024 16 | MAX_INPUT_TOKEN_LENGTH = 4000 17 | 18 | DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Tom 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "llama2-wrapper" 3 | version = "0.1.14" 4 | description = "Use llama2-wrapper as your local llama2 backend for Generative Agents / Apps" 5 | authors = ["liltom-eth "] 6 | license = "MIT" 7 | homepage = "https://github.com/liltom-eth/llama2-webui" 8 | repository = "https://github.com/liltom-eth/llama2-webui" 9 | readme = "./docs/pypi.md" 10 | 11 | packages = [{include = "llama2_wrapper"}] 12 | 13 | [tool.poetry.dependencies] 14 | python = ">=3.10,<3.13" 15 | accelerate = "^0.21.0" 16 | auto-gptq = "0.3.0" 17 | gradio = "3.37.0" 18 | protobuf = "3.20.3" 19 | scipy = "1.11.1" 20 | sentencepiece = "0.1.99" 21 | torch = "2.0.1" 22 | transformers = "4.31.0" 23 | tqdm = "4.65.0" 24 | python-dotenv = "1.0.0" 25 | llama-cpp-python = "0.2.11" 26 | bitsandbytes = [ 27 | {platform = 'linux', version = "0.40.2"}, 28 | {platform = 'darwin', version = "0.40.2"}, 29 | ] 30 | memory-profiler = "0.61.0" 31 | huggingface-hub = "0.16.4" 32 | fastapi = "0.100.0" 33 | uvicorn = "0.23.1" 34 | sse-starlette = "1.6.5" 35 | pydantic = "2.2.1" 36 | pydantic-settings = "2.0.3" 37 | pytest = "7.4.0" 38 | black = "23.7.0" 39 | 40 | 41 | [build-system] 42 | requires = ["poetry-core"] 43 | build-backend = "poetry.core.masonry.api" 44 | 45 | [virtualenvs] 46 | create = true 47 | in-project = true -------------------------------------------------------------------------------- /llama2_wrapper/server/__main__.py: -------------------------------------------------------------------------------- 1 | """Example FastAPI server for llama2_wrapper. 2 | 3 | To run this example: 4 | 5 | ``` 6 | python3 -m llama2_wrapper.server 7 | ``` 8 | 9 | or 10 | 11 | ``` 12 | uvicorn llama2_wrapper.server.app:app --reload 13 | ``` 14 | 15 | Then visit http://localhost:8000/docs to see the interactive API docs. 
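The server exposes OpenAI-compatible routes such as /v1/completions and
/v1/chat/completions (see docs/pypi.md). As a rough, untested sketch that
assumes the OpenAI-style request schema, a completion request could look like:

```
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"prompt": "[INST] Hi do you know Pytorch? [/INST]", "max_tokens": 256}'
```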
16 | 17 | """ 18 | import os 19 | import argparse 20 | 21 | import uvicorn 22 | 23 | from llama2_wrapper.server.app import create_app, Settings 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | for name, field in Settings.model_fields.items(): 28 | description = field.description 29 | if field.default is not None and description is not None: 30 | description += f" (default: {field.default})" 31 | parser.add_argument( 32 | f"--{name}", 33 | dest=name, 34 | type=field.annotation if field.annotation is not None else str, 35 | help=description, 36 | ) 37 | 38 | args = parser.parse_args() 39 | settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) 40 | app = create_app(settings=settings) 41 | 42 | uvicorn.run( 43 | app, 44 | host=os.getenv("HOST", settings.host), 45 | port=int(os.getenv("PORT", settings.port)), 46 | ) 47 | -------------------------------------------------------------------------------- /prompts/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from hashlib import md5 4 | 5 | 6 | def read_csv_to_dict_list(file_path): 7 | with open(file_path, mode="r", encoding="utf-8") as file: 8 | reader = csv.DictReader(file) 9 | list_of_dicts = [row for row in reader] 10 | return list_of_dicts 11 | 12 | 13 | def split_list_with_key(lst, dict_key): 14 | result = {} 15 | for row in lst: 16 | if row.get(dict_key) not in result: 17 | result[row.get(dict_key)] = [] 18 | result[row.get(dict_key)].append(row) 19 | return result 20 | 21 | 22 | def read_csv_to_type_dict(file_path, type_key): 23 | lst = read_csv_to_dict_list(file_path=file_path) 24 | return split_list_with_key(lst=lst, dict_key=type_key) 25 | 26 | 27 | def md5_str(str): 28 | return md5(str.encode("utf8")).hexdigest() 29 | 30 | 31 | current_dir = os.path.dirname(__file__) 32 | 33 | 34 | class PromtsContainer(object): 35 | def __init__(self) -> None: 36 | prompts_path = os.path.join(current_dir, "prompts_en.csv") 37 | self.data = read_csv_to_type_dict(prompts_path, "type") 38 | self.summary_dict = { 39 | md5_str(row.get("summary")): row.get("prompt") 40 | for chunk in self.data.values() 41 | for row in chunk 42 | } 43 | 44 | def get_prompts_tab_dict(self): 45 | return self.data 46 | 47 | def get_prompt_by_summary(self, summary): 48 | return self.summary_dict.get(md5_str(summary), summary) 49 | -------------------------------------------------------------------------------- /llama2_wrapper/download/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument( 8 | "--repo_id", 9 | type=str, 10 | default="", 11 | required=True, 12 | help="Repo ID like 'TheBloke/Llama-2-7B-Chat-GGML' ", 13 | ) 14 | parser.add_argument( 15 | "--filename", 16 | type=str, 17 | default=None, 18 | help="Filename like llama-2-7b-chat.ggmlv3.q4_0.bin", 19 | ) 20 | parser.add_argument( 21 | "--save_dir", type=str, default="./models", help="Directory to save models" 22 | ) 23 | 24 | args = parser.parse_args() 25 | 26 | repo_id = args.repo_id 27 | save_dir = args.save_dir 28 | 29 | if not os.path.exists(save_dir): 30 | os.makedirs(save_dir) 31 | 32 | if args.filename: 33 | filename = args.filename 34 | from huggingface_hub import hf_hub_download 35 | 36 | print(f"Start downloading model {repo_id} {filename} to: {save_dir}") 37 | 38 | hf_hub_download( 39 | repo_id=repo_id, 40 | 
filename=filename, 41 | local_dir=save_dir, 42 | ) 43 | else: 44 | repo_name = repo_id.split("/")[1] 45 | save_path = os.path.join(save_dir, repo_name) 46 | if not os.path.exists(save_path): 47 | os.makedirs(save_path) 48 | print(f"Start downloading model {repo_id} to: {save_path}") 49 | 50 | from huggingface_hub import snapshot_download 51 | 52 | snapshot_download( 53 | repo_id=repo_id, 54 | local_dir=save_path, 55 | ) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /.github/workflows/branch.yml: -------------------------------------------------------------------------------- 1 | name: Push 2 | on: [push] 3 | 4 | jobs: 5 | test: 6 | strategy: 7 | fail-fast: false 8 | matrix: 9 | python-version: ['3.10'] 10 | poetry-version: ['1.5.1'] 11 | os: [ubuntu-latest] 12 | runs-on: ${{ matrix.os }} 13 | steps: 14 | - uses: actions/checkout@v3 15 | - uses: actions/setup-python@v3 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Run image 19 | uses: abatilo/actions-poetry@v2.1.4 20 | with: 21 | poetry-version: ${{ matrix.poetry-version }} 22 | - name: Install dependencies 23 | run: poetry install 24 | - name: Run tests 25 | run: poetry run pytest 26 | - name: Upload coverage reports to Codecov 27 | uses: codecov/codecov-action@v3 28 | env: 29 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 30 | # - name: Upload coverage to Codecov 31 | # uses: codecov/codecov-action@v2 32 | code-quality: 33 | strategy: 34 | fail-fast: false 35 | matrix: 36 | python-version: ['3.10'] 37 | poetry-version: ['1.5.1'] 38 | os: [ubuntu-latest] 39 | runs-on: ${{ matrix.os }} 40 | steps: 41 | - uses: actions/checkout@v3 42 | - uses: actions/setup-python@v3 43 | with: 44 | python-version: ${{ matrix.python-version }} 45 | - name: Python Poetry Action 46 | uses: abatilo/actions-poetry@v2.1.6 47 | with: 48 | poetry-version: ${{ matrix.poetry-version }} 49 | - name: Install dependencies 50 | run: poetry install 51 | - name: Run black 52 | run: poetry run black . --check 53 | # - name: Run isort 54 | # run: poetry run isort . --check-only --profile black 55 | # - name: Run flake8 56 | # run: poetry run flake8 . 57 | # - name: Run bandit 58 | # run: poetry run bandit . 59 | # - name: Run saftey 60 | # run: poetry run safety check 61 | -------------------------------------------------------------------------------- /tests/test_get_prompt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from llama2_wrapper.model import get_prompt_for_dialog 3 | 4 | 5 | class TestClassGetPromptForDialog: 6 | from llama2_wrapper.types import Message 7 | 8 | dialog = [] 9 | message1 = Message( 10 | role="system", 11 | content="You are a helpful, respectful and honest assistant. ", 12 | ) 13 | message2 = Message( 14 | role="user", 15 | content="Hi do you know Pytorch?", 16 | ) 17 | dialog.append(message1) 18 | dialog.append(message2) 19 | 20 | dialog2 = [] 21 | dialog2.append(message1) 22 | dialog2.append(message2) 23 | message3 = Message( 24 | role="assistant", 25 | content="Yes I know Pytorch. 
", 26 | ) 27 | message4 = Message( 28 | role="user", 29 | content="Can you write a CNN in Pytorch?", 30 | ) 31 | dialog2.append(message3) 32 | dialog2.append(message4) 33 | 34 | dialog3 = [] 35 | dialog3.append(message3) 36 | dialog3.append(message4) 37 | dialog3.append(message3) 38 | dialog3.append(message4) 39 | message5 = Message( 40 | role="assistant", 41 | content="Yes I can write a CNN in Pytorch.", 42 | ) 43 | dialog3.append(message5) 44 | 45 | def test_dialog1(self): 46 | prompt = get_prompt_for_dialog(self.dialog) 47 | # print(prompt) 48 | result = """[INST] <>\nYou are a helpful, respectful and honest assistant. \n<>\n\nHi do you know Pytorch? [/INST]""" 49 | assert prompt == result 50 | 51 | def test_dialog2(self): 52 | prompt = get_prompt_for_dialog(self.dialog2) 53 | # print(prompt) 54 | result = """[INST] <>\nYou are a helpful, respectful and honest assistant. \n<>\n\nHi do you know Pytorch? [/INST] Yes I know Pytorch. [INST] Can you write a CNN in Pytorch? [/INST]""" 55 | assert prompt == result 56 | 57 | def test_dialog3(self): 58 | with pytest.raises(AssertionError): 59 | prompt = get_prompt_for_dialog(self.dialog3) 60 | -------------------------------------------------------------------------------- /docs/news.md: -------------------------------------------------------------------------------- 1 | # News 2 | - [2023/09] The newest `llama2-wrapper>=0.1.14` supports llama.cpp's `gguf` models. 3 | 4 | - [2023/08] 🔥 For developers, we offer a web server that acts as a drop-in replacement for the OpenAI API. 5 | 6 | - Usage: 7 | 8 | ``` 9 | python3 -m llama2_wrapper.server 10 | ``` 11 | 12 | 13 | 14 | - [2023/08] 🔥 For developers, we released `llama2-wrapper` as a llama2 backend wrapper in [PYPI](https://pypi.org/project/llama2-wrapper/). 15 | 16 | - Install: `pip install llama2-wrapper` 17 | 18 | - Usage: 19 | 20 | ```python 21 | from llama2_wrapper import LLAMA2_WRAPPER, get_prompt 22 | llama2_wrapper = LLAMA2_WRAPPER( 23 | model_path="./models/Llama-2-7B-Chat-GGML/llama-2-7b-chat.ggmlv3.q4_0.bin", 24 | backend_type="llama.cpp", #options: llama.cpp, transformers, gptq 25 | ) 26 | prompt = "Do you know Pytorch" 27 | llama2_promt = get_prompt(prompt) 28 | answer = llama2_wrapper(llama2_promt, temperature=0.9) 29 | ``` 30 | 31 | - [2023/08] 🔥 We added `benchmark.py` for users to benchmark llama2 models on their local devices. 32 | 33 | - Check/contribute the performance of your device in the full [performance doc](https://github.com/liltom-eth/llama2-webui/blob/main/docs/performance.md). 34 | 35 | - [2023/07] We released **[llama2-webui](https://github.com/liltom-eth/llama2-webui)**, a gradio web UI to run Llama 2 on GPU or CPU from anywhere (Linux/Windows/Mac). 36 | 37 | - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), all [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), all [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) ... 
38 | - Supporting model backends: [tranformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp) -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | # Benchmark Performance 2 | 3 | ## Performance on Nvidia GPU 4 | 5 | | Model | Precision | Device | GPU VRAM | Speed (tokens/sec) | load time (s) | 6 | | --------------------------------- | --------- | ---------- | ---------------------- | ---------------- | ---------------- | 7 | | Llama-2-7b-chat-hf | 16 bit | | | | | 8 | | Llama-2-7b-chat-hf | 8bit | NVIDIA RTX 2080 Ti | 7.7 GB VRAM | 3.76 | 641.36 | 9 | | Llama-2-7b-Chat-GPTQ | 4bit | NVIDIA RTX 2080 Ti | 5.8 GB VRAM | 18.85 | 192.91 | 10 | | Llama-2-7b-Chat-GPTQ | 4bit | NVIDIA GTX 1660 Super | 4.8 GB VRAM | 8.5 | 262.74 | 11 | | Llama-2-7b-Chat-GPTQ | 4 bit | Google Colab T4 | 5.8 GB VRAM | 18.19 | 37.44 | 12 | | Llama-2-13b-chat-hf | 16 bit | | | | | 13 | | | | | | | | 14 | 15 | ## Performance on CPU / OpenBLAS / cuBLAS / CLBlast / Metal 16 | 17 | | Model | Precision | Device | RAM / GPU VRAM | Speed (tokens/sec) | load time (s) | 18 | | --------------------------------- | --------- | ---------- | ---------------------- | ---------------- | ---------------- | 19 | | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Intel i7-8700 | 4.5 GB RAM | 7.88 | 31.90 | 20 | | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Apple M2 CPU | 4.5 GB RAM | 11.10 | 0.10 | 21 | | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Apple M2 Metal | 4.5 GB RAM | 12.10 | 0.12 | 22 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel i7-8700 | 5.4 GB RAM | 6.27 | 173.15 | 23 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel i7-9700 | 4.8 GB RAM | 4.2 | 87.9 | 24 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M1 Pro CPU | 5.4 GB RAM | 17.90 | 0.18 | 25 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 CPU | 5.4 GB RAM | 13.70 | 0.13 | 26 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 Metal | 5.4 GB RAM | 12.60 | 0.10 | 27 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | AMD Ryzen 9 5900HS | 4.1 GB RAM | 6.01 | 0.15 | 28 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel vServer 4 threads, eth services | 8 GB RAM | 1.31 | 0.5| 29 | | llama-2-7b-chat.ggmlv3.q8_0 | 8 bit | Intel i7-8700 | 8.6 GB RAM | 2.63 | 336.57 | 30 | | llama-2-7b-chat.ggmlv3.q8_0 | 8 bit | Intel i7-9700 | 7.6 GB RAM | 2.05 | 302.9 | 31 | | | | | | | | 32 | 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to [llama2-webui](https://github.com/liltom-eth/llama2-webui) 2 | 3 | We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's: 4 | 5 | - Reporting a bug 6 | - Proposing new features 7 | - Discussing the current state of the code 8 | - Update README.md 9 | - Submitting a PR 10 | 11 | ## Using GitHub's [issues](https://github.com/liltom-eth/llama2-webui/issues) 12 | 13 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/liltom-eth/llama2-webui/issues). It's that easy! 
14 | 15 | Thanks for **[jlb1504](https://github.com/jlb1504)** for reporting the [first issue](https://github.com/liltom-eth/llama2-webui/issues/1)! 16 | 17 | **Great Bug Reports** tend to have: 18 | 19 | - A quick summary and/or background 20 | - Steps to reproduce 21 | - Be specific! 22 | - Give a sample code if you can. 23 | - What you expected would happen 24 | - What actually happens 25 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) 26 | 27 | Proposing new features are also welcome. 28 | 29 | ## Pull Request 30 | 31 | All pull requests are welcome. For example, you update the `README.md` to help users to better understand the usage. 32 | 33 | ### Clone the repository 34 | 35 | 1. Create a user account on GitHub if you do not already have one. 36 | 37 | 2. Fork the project [repository](https://github.com/liltom-eth/llama2-webui): click on the *Fork* button near the top of the page. This creates a copy of the code under your account on GitHub. 38 | 39 | 3. Clone this copy to your local disk: 40 | 41 | ``` 42 | git clone git@github.com:liltom-eth/llama2-webui.git 43 | cd llama2-webui 44 | ``` 45 | 46 | ### Implement your changes 47 | 48 | 1. Create a branch to hold your changes: 49 | 50 | ``` 51 | git checkout -b my-feature 52 | ``` 53 | 54 | and start making changes. Never work on the main branch! 55 | 56 | 2. Start your work on this branch. 57 | 58 | 3. When you’re done editing, do: 59 | 60 | ``` 61 | git add 62 | git commit 63 | ``` 64 | 65 | to record your changes in [git](https://git-scm.com/). 66 | 67 | ### Submit your contribution 68 | 69 | 1. If everything works fine, push your local branch to the remote server with: 70 | 71 | ``` 72 | git push -u origin my-feature 73 | ``` 74 | 75 | 2. Go to the web page of your fork and click "Create pull request" to send your changes for review. 76 | 77 | ```{todo} 78 | Find more detailed information in [creating a PR]. You might also want to open 79 | the PR as a draft first and mark it as ready for review after the feedbacks 80 | from the continuous integration (CI) system or any required fixes. 81 | ``` 82 | 83 | ## License 84 | 85 | By contributing, you agree that your contributions will be licensed under its MIT License. 86 | 87 | ## Questions? 
88 | 89 | Email us at [liltom.eth@gmail.com](mailto:liltom.eth@gmail.com) 90 | 91 | -------------------------------------------------------------------------------- /llama2_wrapper/types.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional, Dict, Union 2 | from typing_extensions import TypedDict, NotRequired, Literal 3 | 4 | B_INST, E_INST = "[INST]", "[/INST]" 5 | B_SYS, E_SYS = "<>\n", "\n<>\n\n" 6 | 7 | 8 | # Role = Literal["system", "user", "assistant"] 9 | # class Message(TypedDict): 10 | # role: Role 11 | # content: str 12 | 13 | 14 | class ChatCompletionMessage(TypedDict): 15 | role: Literal["assistant", "user", "system"] 16 | content: str 17 | user: NotRequired[str] 18 | 19 | 20 | # transformers: Message; llama.cpp: ChatCompletionMessage 21 | Message = ChatCompletionMessage 22 | Dialog = List[Message] 23 | 24 | 25 | class EmbeddingUsage(TypedDict): 26 | prompt_tokens: int 27 | total_tokens: int 28 | 29 | 30 | class EmbeddingData(TypedDict): 31 | index: int 32 | object: str 33 | embedding: List[float] 34 | 35 | 36 | class Embedding(TypedDict): 37 | object: Literal["list"] 38 | model: str 39 | data: List[EmbeddingData] 40 | usage: EmbeddingUsage 41 | 42 | 43 | class CompletionLogprobs(TypedDict): 44 | text_offset: List[int] 45 | token_logprobs: List[Optional[float]] 46 | tokens: List[str] 47 | top_logprobs: List[Optional[Dict[str, float]]] 48 | 49 | 50 | class CompletionChoice(TypedDict): 51 | text: str 52 | index: int 53 | logprobs: Optional[CompletionLogprobs] 54 | finish_reason: Optional[str] 55 | 56 | 57 | class CompletionUsage(TypedDict): 58 | prompt_tokens: int 59 | completion_tokens: int 60 | total_tokens: int 61 | 62 | 63 | class CompletionChunk(TypedDict): 64 | id: str 65 | object: Literal["text_completion"] 66 | created: int 67 | model: str 68 | choices: List[CompletionChoice] 69 | 70 | 71 | class Completion(TypedDict): 72 | id: str 73 | object: Literal["text_completion"] 74 | created: int 75 | model: str 76 | choices: List[CompletionChoice] 77 | usage: CompletionUsage 78 | 79 | 80 | class ChatCompletionChoice(TypedDict): 81 | index: int 82 | message: ChatCompletionMessage 83 | finish_reason: Optional[str] 84 | 85 | 86 | class ChatCompletion(TypedDict): 87 | id: str 88 | object: Literal["chat.completion"] 89 | created: int 90 | model: str 91 | choices: List[ChatCompletionChoice] 92 | usage: CompletionUsage 93 | 94 | 95 | class ChatCompletionChunkDeltaEmpty(TypedDict): 96 | pass 97 | 98 | 99 | class ChatCompletionChunkDelta(TypedDict): 100 | role: NotRequired[Literal["assistant"]] 101 | content: NotRequired[str] 102 | 103 | 104 | class ChatCompletionChunkChoice(TypedDict): 105 | index: int 106 | delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty] 107 | finish_reason: Optional[str] 108 | 109 | 110 | class ChatCompletionChunk(TypedDict): 111 | id: str 112 | model: str 113 | object: Literal["chat.completion.chunk"] 114 | created: int 115 | choices: List[ChatCompletionChunkChoice] 116 | -------------------------------------------------------------------------------- /colab/ggmlv3_q4_0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "toc_visible": true, 8 | "authorship_tag": "ABX9TyM9WbudQYrVFksXUrt4Opt3", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | 
"language_info": { 16 | "name": "python" 17 | } 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "id": "7O5JSosg5-rx" 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "%cd /content\n", 39 | "!pip install llama2-wrapper\n" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "source": [ 45 | "from llama2_wrapper import LLAMA2_WRAPPER, get_prompt\n", 46 | "\n", 47 | "llama2_wrapper = LLAMA2_WRAPPER()" 48 | ], 49 | "metadata": { 50 | "colab": { 51 | "base_uri": "https://localhost:8080/" 52 | }, 53 | "id": "8rgb1ckl72wC", 54 | "outputId": "d9ca2e20-26a5-490b-86f2-1a182e533b20" 55 | }, 56 | "execution_count": 5, 57 | "outputs": [ 58 | { 59 | "output_type": "stream", 60 | "name": "stdout", 61 | "text": [ 62 | "Running on backend llama.cpp.\n", 63 | "Use default model path: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin\n", 64 | "Start downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin\n" 65 | ] 66 | } 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "prompt = get_prompt(\"Hi do you know Pytorch?\")\n", 73 | "print(llama2_wrapper(prompt))" 74 | ], 75 | "metadata": { 76 | "id": "Qz2xAqozTIf6", 77 | "colab": { 78 | "base_uri": "https://localhost:8080/" 79 | }, 80 | "outputId": "1380fa52-3d4a-4ac5-ed02-7faefe7ec2f6" 81 | }, 82 | "execution_count": 3, 83 | "outputs": [ 84 | { 85 | "output_type": "stream", 86 | "name": "stdout", 87 | "text": [ 88 | " Yes, I'm familiar with PyTorch! PyTorch is an open-source deep learning framework that is widely used for building and training neural networks. It was originally developed by Facebook and is now maintained by the PyTorch Foundation.\n", 89 | "\n", 90 | "Here are some key features and capabilities of PyTorch:\n", 91 | "\n", 92 | "1. **Tensor Computation**: PyTorch provides a powerful tensor computation engine that allows for complex mathematical operations on large datasets.\n", 93 | "2. **Autograd**: PyTorch's autograd system automatically computes gradients, which can save a lot of time and effort during training.\n", 94 | "3. **Dynamic Compute**: PyTorch's dynamic compute system allows for more efficient computation by only computing the necessary computations at runtime.\n", 95 | "4. **Memory-efficient**: PyTorch is designed to be memory-efficient, which is important for training large models that require a lot of memory.\n", 96 | "5. **Accelerators**: PyTorch supports a wide range of accelerators, including GPUs, TPUs, and FPGAs, which can significantly speed up training times.\n", 97 | "6. **Modules**: PyTorch provides a wide range of pre-built modules for common tasks, such as convolutional layers, recurrent neural networks, and more.\n", 98 | "7. **Extensive Community**: PyTorch has a large and active community of developers and users, which can be helpful for getting support and staying up-to-date with the latest developments.\n", 99 | "8. **Easy Integration**: PyTorch can be easily integrated with other popular deep learning frameworks, such as TensorFlow and Keras.\n", 100 | "9. **Pythonic**: PyTorch is written in Python, which is a popular and easy-to-learn programming language.\n", 101 | "10. 
**Flexible**: PyTorch allows for a wide range of customization options, which can be useful for building and training unique models.\n", 102 | "\n", 103 | "Overall, PyTorch is a powerful and flexible deep learning framework that can be used for a wide range of applications, including computer vision, natural language processing, and more.\n" 104 | ] 105 | } 106 | ] 107 | } 108 | ] 109 | } -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import argparse 4 | 5 | from dotenv import load_dotenv 6 | from distutils.util import strtobool 7 | from memory_profiler import memory_usage 8 | from tqdm import tqdm 9 | 10 | from llama2_wrapper import LLAMA2_WRAPPER 11 | 12 | 13 | def run_iteration( 14 | llama2_wrapper, prompt_example, DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS 15 | ): 16 | def generation(): 17 | generator = llama2_wrapper.run( 18 | prompt_example, 19 | [], 20 | DEFAULT_SYSTEM_PROMPT, 21 | DEFAULT_MAX_NEW_TOKENS, 22 | 1, 23 | 0.95, 24 | 50, 25 | ) 26 | model_response = None 27 | try: 28 | first_model_response = next(generator) 29 | except StopIteration: 30 | pass 31 | for model_response in generator: 32 | pass 33 | return llama2_wrapper.get_token_length(model_response), model_response 34 | 35 | tic = time.perf_counter() 36 | mem_usage, (output_token_length, model_response) = memory_usage( 37 | (generation,), max_usage=True, retval=True 38 | ) 39 | toc = time.perf_counter() 40 | 41 | generation_time = toc - tic 42 | tokens_per_second = output_token_length / generation_time 43 | 44 | return generation_time, tokens_per_second, mem_usage, model_response 45 | 46 | 47 | def main(): 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("--iter", type=int, default=5, help="Number of iterations") 50 | parser.add_argument("--model_path", type=str, default="", help="model path") 51 | parser.add_argument( 52 | "--backend_type", 53 | type=str, 54 | default="", 55 | help="Backend options: llama.cpp, gptq, transformers", 56 | ) 57 | parser.add_argument( 58 | "--load_in_8bit", 59 | type=bool, 60 | default=False, 61 | help="Whether to use bitsandbytes 8 bit.", 62 | ) 63 | 64 | args = parser.parse_args() 65 | 66 | load_dotenv() 67 | 68 | DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "") 69 | MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048)) 70 | DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", 1024)) 71 | MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", 4000)) 72 | 73 | MODEL_PATH = os.getenv("MODEL_PATH") 74 | assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}" 75 | BACKEND_TYPE = os.getenv("BACKEND_TYPE") 76 | assert BACKEND_TYPE is not None, f"BACKEND_TYPE is required, got: {BACKEND_TYPE}" 77 | 78 | LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True"))) 79 | 80 | if args.model_path != "": 81 | MODEL_PATH = args.model_path 82 | if args.backend_type != "": 83 | BACKEND_TYPE = args.backend_type 84 | if args.load_in_8bit: 85 | LOAD_IN_8BIT = True 86 | 87 | # Initialization 88 | init_tic = time.perf_counter() 89 | llama2_wrapper = LLAMA2_WRAPPER( 90 | model_path=MODEL_PATH, 91 | backend_type=BACKEND_TYPE, 92 | max_tokens=MAX_INPUT_TOKEN_LENGTH, 93 | load_in_8bit=LOAD_IN_8BIT, 94 | # verbose=True, 95 | ) 96 | 97 | init_toc = time.perf_counter() 98 | initialization_time = init_toc - init_tic 99 | 100 | total_time = 0 101 | 
total_tokens_per_second = 0 102 | total_memory_gen = 0 103 | 104 | prompt_example = ( 105 | "Can you explain briefly to me what is the Python programming language?" 106 | ) 107 | 108 | # Cold run 109 | print("Performing cold run...") 110 | run_iteration( 111 | llama2_wrapper, prompt_example, DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS 112 | ) 113 | 114 | # Timed runs 115 | print(f"Performing {args.iter} timed runs...") 116 | for i in tqdm(range(args.iter)): 117 | try: 118 | gen_time, tokens_per_sec, mem_gen, model_response = run_iteration( 119 | llama2_wrapper, 120 | prompt_example, 121 | DEFAULT_SYSTEM_PROMPT, 122 | DEFAULT_MAX_NEW_TOKENS, 123 | ) 124 | total_time += gen_time 125 | total_tokens_per_second += tokens_per_sec 126 | total_memory_gen += mem_gen 127 | except: 128 | break 129 | avg_time = total_time / (i + 1) 130 | avg_tokens_per_second = total_tokens_per_second / (i + 1) 131 | avg_memory_gen = total_memory_gen / (i + 1) 132 | 133 | print(f"Last model response: {model_response}") 134 | print(f"Initialization time: {initialization_time:0.4f} seconds.") 135 | print( 136 | f"Average generation time over {(i + 1)} iterations: {avg_time:0.4f} seconds." 137 | ) 138 | print( 139 | f"Average speed over {(i + 1)} iterations: {avg_tokens_per_second:0.4f} tokens/sec." 140 | ) 141 | print(f"Average memory usage during generation: {avg_memory_gen:.2f} MiB") 142 | 143 | 144 | if __name__ == "__main__": 145 | main() 146 | -------------------------------------------------------------------------------- /docs/pypi.md: -------------------------------------------------------------------------------- 1 | # llama2-wrapper 2 | 3 | - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb). 4 | 5 | - [Run OpenAI Compatible API](https://github.com/liltom-eth/llama2-webui#start-openai-compatible-api) on Llama2 models. 6 | 7 | ## Features 8 | 9 | - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)... 10 | - Supporting model backends: [tranformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp) 11 | - Demos: [Run Llama2 on MacBook Air](https://twitter.com/liltom_eth/status/1682791729207070720?s=20); [Run Llama2 on Colab T4 GPU](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb) 12 | - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb). 13 | - [Run OpenAI Compatible API](https://github.com/liltom-eth/llama2-webui#start-openai-compatible-api) on Llama2 models. 
14 | - [News](https://github.com/liltom-eth/llama2-webui/blob/main/docs/news.md), [Benchmark](https://github.com/liltom-eth/llama2-webui/blob/main/docs/performance.md), [Issue Solutions](https://github.com/liltom-eth/llama2-webui/blob/main/docs/issues.md) 15 | 16 | [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) is the backend and part of [llama2-webui](https://github.com/liltom-eth/llama2-webui), which can run any Llama 2 locally with gradio UI on GPU or CPU from anywhere (Linux/Windows/Mac). 17 | 18 | ## Install 19 | 20 | ```bash 21 | pip install llama2-wrapper 22 | ``` 23 | 24 | ## Start OpenAI Compatible API 25 | 26 | ``` 27 | python -m llama2_wrapper.server 28 | ``` 29 | 30 | it will use `llama.cpp` as the backend by default to run `llama-2-7b-chat.ggmlv3.q4_0.bin` model. 31 | 32 | Start Fast API for `gptq` backend: 33 | 34 | ``` 35 | python -m llama2_wrapper.server --backend_type gptq 36 | ``` 37 | 38 | Navigate to http://localhost:8000/docs to see the OpenAPI documentation. 39 | 40 | ## API Usage 41 | 42 | ### `__call__` 43 | 44 | `__call__()` is the function to generate text from a prompt. 45 | 46 | For example, run ggml llama2 model on CPU, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/ggmlv3_q4_0.ipynb): 47 | 48 | ```python 49 | from llama2_wrapper import LLAMA2_WRAPPER, get_prompt 50 | llama2_wrapper = LLAMA2_WRAPPER() 51 | # Default running on backend llama.cpp. 52 | # Automatically downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin 53 | prompt = "Do you know Pytorch" 54 | # llama2_wrapper() will run __call__() 55 | answer = llama2_wrapper(get_prompt(prompt), temperature=0.9) 56 | ``` 57 | 58 | Run gptq llama2 model on Nvidia GPU, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb): 59 | 60 | ```python 61 | from llama2_wrapper import LLAMA2_WRAPPER 62 | llama2_wrapper = LLAMA2_WRAPPER(backend_type="gptq") 63 | # Automatically downloading model to: ./models/Llama-2-7b-Chat-GPTQ 64 | ``` 65 | 66 | Run llama2 7b with bitsandbytes 8 bit with a `model_path`: 67 | 68 | ```python 69 | from llama2_wrapper import LLAMA2_WRAPPER 70 | llama2_wrapper = LLAMA2_WRAPPER( 71 | model_path = "./models/Llama-2-7b-chat-hf", 72 | backend_type = "transformers", 73 | load_in_8bit = True 74 | ) 75 | ``` 76 | 77 | ### completion 78 | 79 | `completion()` is the function to generate text from a prompt for OpenAI compatible API `/v1/completions`. 80 | 81 | ```python 82 | llama2_wrapper = LLAMA2_WRAPPER() 83 | prompt = get_prompt("Hi do you know Pytorch?") 84 | print(llm.completion(prompt)) 85 | ``` 86 | 87 | ### chat_completion 88 | 89 | `chat_completion()` is the function to generate text from a dialog (chat history) for OpenAI compatible API `/v1/chat/completions`. 90 | 91 | ```python 92 | llama2_wrapper = LLAMA2_WRAPPER() 93 | dialog = [ 94 | { 95 | "role":"system", 96 | "content":"You are a helpful, respectful and honest assistant. " 97 | },{ 98 | "role":"user", 99 | "content":"Hi do you know Pytorch?", 100 | }, 101 | ] 102 | print(llm.chat_completion(dialog)) 103 | ``` 104 | 105 | ### generate 106 | 107 | `generate()` is the function to create a generator of response from a prompt. 108 | 109 | This is useful when you want to stream the output like typing in the chatbot. 
110 | 111 | ```python 112 | llama2_wrapper = LLAMA2_WRAPPER() 113 | prompt = get_prompt("Hi do you know Pytorch?") 114 | for response in llama2_wrapper.generate(prompt): 115 | print(response) 116 | 117 | ``` 118 | 119 | The response will be like: 120 | 121 | ``` 122 | Yes, 123 | Yes, I'm 124 | Yes, I'm familiar 125 | Yes, I'm familiar with 126 | Yes, I'm familiar with PyTorch! 127 | ... 128 | ``` 129 | 130 | ### run 131 | 132 | `run()` is similar to `generate()`, but `run()`can also accept `chat_history`and `system_prompt` from the users. 133 | 134 | It will process the input message to llama2 prompt template with `chat_history` and `system_prompt` for a chatbot-like app. 135 | 136 | ### get_prompt 137 | 138 | `get_prompt()` will process the input message to llama2 prompt with `chat_history` and `system_prompt`for chatbot. 139 | 140 | By default, `chat_history` and `system_prompt` are empty and `get_prompt()` will add llama2 prompt template to your message: 141 | 142 | ```python 143 | prompt = get_prompt("Hi do you know Pytorch?") 144 | ``` 145 | 146 | prompt will be: 147 | 148 | ``` 149 | [INST] <> 150 | 151 | <> 152 | 153 | Hi do you know Pytorch? [/INST] 154 | ``` 155 | 156 | If use `get_prompt("Hi do you know Pytorch?", system_prompt="You are a helpful...")`: 157 | 158 | ``` 159 | [INST] <> 160 | You are a helpful, respectful and honest assistant. 161 | <> 162 | 163 | Hi do you know Pytorch? [/INST] 164 | ``` 165 | 166 | ### get_prompt_for_dialog 167 | 168 | `get_prompt_for_dialog()` will process dialog (chat history) to llama2 prompt for OpenAI compatible API `/v1/chat/completions`. 169 | 170 | ```python 171 | dialog = [ 172 | { 173 | "role":"system", 174 | "content":"You are a helpful, respectful and honest assistant. " 175 | },{ 176 | "role":"user", 177 | "content":"Hi do you know Pytorch?", 178 | }, 179 | ] 180 | prompt = get_prompt_for_dialog("Hi do you know Pytorch?") 181 | # [INST] <> 182 | # You are a helpful, respectful and honest assistant. 183 | # <> 184 | # 185 | # Hi do you know Pytorch? [/INST] 186 | ``` 187 | 188 | -------------------------------------------------------------------------------- /code_completion.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gradio as gr 4 | from llama2_wrapper import LLAMA2_WRAPPER 5 | 6 | FIM_PREFIX = "
 "
  7 | FIM_MIDDLE = " <MID>"
  8 | FIM_SUFFIX = " <SUF>"
  9 | 
 10 | FIM_INDICATOR = "<FILL_ME>"
 11 | 
 12 | EOS_STRING = "</s>"
 13 | EOT_STRING = "<EOT>"
 14 | 
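# Note: <PRE> / <SUF> / <MID> above are Code Llama's fill-in-the-middle sentinel
# tokens, <FILL_ME> marks the spot in a prompt to be filled in, </s> is the
# end-of-sequence token, and <EOT> marks the end of an infill generation.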
 15 | 
 16 | def main():
 17 |     parser = argparse.ArgumentParser()
 18 |     parser.add_argument(
 19 |         "--model_path",
 20 |         type=str,
 21 |         default="./models/codellama-7b-instruct.ggmlv3.Q4_0.bin",
 22 |         help="model path",
 23 |     )
 24 |     parser.add_argument(
 25 |         "--backend_type",
 26 |         type=str,
 27 |         default="llama.cpp",
 28 |         help="Backend options: llama.cpp, gptq, transformers",
 29 |     )
 30 |     parser.add_argument(
 31 |         "--max_tokens",
 32 |         type=int,
 33 |         default=4000,
 34 |         help="Maximum context size.",
 35 |     )
 36 |     parser.add_argument(
 37 |         "--load_in_8bit",
 38 |         type=bool,
 39 |         default=False,
 40 |         help="Whether to use bitsandbytes 8 bit.",
 41 |     )
 42 |     parser.add_argument(
 43 |         "--share",
 44 |         type=bool,
 45 |         default=False,
 46 |         help="Whether to share public for gradio.",
 47 |     )
 48 |     args = parser.parse_args()
 49 | 
 50 |     llama2_wrapper = LLAMA2_WRAPPER(
 51 |         model_path=args.model_path,
 52 |         backend_type=args.backend_type,
 53 |         max_tokens=args.max_tokens,
 54 |         load_in_8bit=args.load_in_8bit,
 55 |     )
 56 | 
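    # generate() streams text for `prompt`. If the prompt contains FIM_INDICATOR
    # (<FILL_ME>), it is split into prefix/suffix and rewritten into Code Llama's
    # infill format so the model generates the missing middle section.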
 57 |     def generate(
 58 |         prompt,
 59 |         temperature=0.9,
 60 |         max_new_tokens=256,
 61 |         top_p=0.95,
 62 |         repetition_penalty=1.0,
 63 |     ):
 64 |         temperature = float(temperature)
 65 |         if temperature < 1e-2:
 66 |             temperature = 1e-2
 67 |         top_p = float(top_p)
 68 |         fim_mode = False
 69 | 
 70 |         generate_kwargs = dict(
 71 |             temperature=temperature,
 72 |             max_new_tokens=max_new_tokens,
 73 |             top_p=top_p,
 74 |             repetition_penalty=repetition_penalty,
 75 |             stream=True,
 76 |         )
 77 | 
 78 |         if FIM_INDICATOR in prompt:
 79 |             fim_mode = True
 80 |             try:
 81 |                 prefix, suffix = prompt.split(FIM_INDICATOR)
 82 |             except ValueError:
 83 |                 raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt!")
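            # Code Llama infill ("PSM") prompt layout: <PRE> {prefix} <SUF> {suffix} <MID>;
            # the model then generates the missing middle between prefix and suffix.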
 84 |             prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
 85 | 
 86 |         stream = llama2_wrapper.__call__(prompt, **generate_kwargs)
 87 | 
 88 |         if fim_mode:
 89 |             output = prefix
 90 |         else:
 91 |             output = prompt
 92 | 
 93 |         # for response in stream:
 94 |         #     output += response
 95 |         #     yield output
 96 |         # return output
 97 | 
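        # Stream chunks until an end marker (</s> or <EOT>) appears. In
        # fill-in-the-middle mode, the original suffix is appended back onto the
        # generated prefix + middle before returning the final output.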
 98 |         previous_token = ""
 99 |         for response in stream:
100 |             if any([end_token in response for end_token in [EOS_STRING, EOT_STRING]]):
101 |                 if fim_mode:
102 |                     output += suffix
103 |                     yield output
104 |                     return output
105 |                     print("output", output)
106 |                 else:
107 |                     return output
108 |             else:
109 |                 output += response
110 |             previous_token = response
111 |             yield output
112 |         return output
113 | 
114 |     examples = [
115 |         'def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\nprint(remove_non_ascii(\'afkdj$$(\'))',
116 |         "X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.1)\n\n# Train a logistic regression model, predict the labels on the test set and compute the accuracy score",
117 |         "// Returns every other value in the array as a new array.\nfunction everyOther(arr) {",
118 |         "Poor English: She no went to the market. Corrected English:",
119 |         "def alternating(list1, list2):\n   results = []\n   for i in range(min(len(list1), len(list2))):\n       results.append(list1[i])\n       results.append(list2[i])\n   if len(list1) > len(list2):\n       <FILL_ME>\n   else:\n       results.extend(list2[i+1:])\n   return results",
120 |     ]
121 | 
122 |     def process_example(args):
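        # Drain the streaming generator and return only the final, fully generated
        # output (used by gr.Examples below to pre-compute example completions).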
123 |         for x in generate(args):
124 |             pass
125 |         return x
126 | 
127 |     description = """
128 |     <div style="text-align: center;">
129 |         <h1>Code Llama Playground</h1>
130 |     </div>
131 |     <div style="text-align: center;">
132 |         <p>This is a demo to complete code with Code Llama. For instruction purposes, please use llama2-webui app.py with CodeLlama-Instruct models.</p>
133 |     </div>
134 | 
135 | """ 136 | with gr.Blocks() as demo: 137 | with gr.Column(): 138 | gr.Markdown(description) 139 | with gr.Row(): 140 | with gr.Column(): 141 | instruction = gr.Textbox( 142 | placeholder="Enter your code here", 143 | lines=5, 144 | label="Input", 145 | elem_id="q-input", 146 | ) 147 | submit = gr.Button("Generate", variant="primary") 148 | output = gr.Code(elem_id="q-output", lines=30, label="Output") 149 | with gr.Row(): 150 | with gr.Column(): 151 | with gr.Accordion("Advanced settings", open=False): 152 | with gr.Row(): 153 | column_1, column_2 = gr.Column(), gr.Column() 154 | with column_1: 155 | temperature = gr.Slider( 156 | label="Temperature", 157 | value=0.1, 158 | minimum=0.0, 159 | maximum=1.0, 160 | step=0.05, 161 | interactive=True, 162 | info="Higher values produce more diverse outputs", 163 | ) 164 | max_new_tokens = gr.Slider( 165 | label="Max new tokens", 166 | value=256, 167 | minimum=0, 168 | maximum=8192, 169 | step=64, 170 | interactive=True, 171 | info="The maximum numbers of new tokens", 172 | ) 173 | with column_2: 174 | top_p = gr.Slider( 175 | label="Top-p (nucleus sampling)", 176 | value=0.90, 177 | minimum=0.0, 178 | maximum=1, 179 | step=0.05, 180 | interactive=True, 181 | info="Higher values sample more low-probability tokens", 182 | ) 183 | repetition_penalty = gr.Slider( 184 | label="Repetition penalty", 185 | value=1.05, 186 | minimum=1.0, 187 | maximum=2.0, 188 | step=0.05, 189 | interactive=True, 190 | info="Penalize repeated tokens", 191 | ) 192 | 193 | gr.Examples( 194 | examples=examples, 195 | inputs=[instruction], 196 | cache_examples=False, 197 | fn=process_example, 198 | outputs=[output], 199 | ) 200 | 201 | submit.click( 202 | generate, 203 | inputs=[ 204 | instruction, 205 | temperature, 206 | max_new_tokens, 207 | top_p, 208 | repetition_penalty, 209 | ], 210 | outputs=[output], 211 | ) 212 | demo.queue(concurrency_count=16).launch(share=args.share) 213 | 214 | 215 | if __name__ == "__main__": 216 | main() 217 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from typing import Iterator 4 | 5 | import gradio as gr 6 | from dotenv import load_dotenv 7 | from distutils.util import strtobool 8 | 9 | from llama2_wrapper import LLAMA2_WRAPPER 10 | 11 | import logging 12 | 13 | from prompts.utils import PromtsContainer 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--model_path", type=str, default="", help="model path") 19 | parser.add_argument( 20 | "--backend_type", 21 | type=str, 22 | default="", 23 | help="Backend options: llama.cpp, gptq, transformers", 24 | ) 25 | parser.add_argument( 26 | "--load_in_8bit", 27 | type=bool, 28 | default=False, 29 | help="Whether to use bitsandbytes 8 bit.", 30 | ) 31 | parser.add_argument( 32 | "--share", 33 | type=bool, 34 | default=False, 35 | help="Whether to share public for gradio.", 36 | ) 37 | args = parser.parse_args() 38 | 39 | load_dotenv() 40 | 41 | DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "") 42 | MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048)) 43 | DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", 1024)) 44 | MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", 4000)) 45 | 46 | MODEL_PATH = os.getenv("MODEL_PATH") 47 | assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}" 48 | BACKEND_TYPE = 
os.getenv("BACKEND_TYPE") 49 | assert BACKEND_TYPE is not None, f"BACKEND_TYPE is required, got: {BACKEND_TYPE}" 50 | 51 | LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True"))) 52 | 53 | if args.model_path != "": 54 | MODEL_PATH = args.model_path 55 | if args.backend_type != "": 56 | BACKEND_TYPE = args.backend_type 57 | if args.load_in_8bit: 58 | LOAD_IN_8BIT = True 59 | 60 | llama2_wrapper = LLAMA2_WRAPPER( 61 | model_path=MODEL_PATH, 62 | backend_type=BACKEND_TYPE, 63 | max_tokens=MAX_INPUT_TOKEN_LENGTH, 64 | load_in_8bit=LOAD_IN_8BIT, 65 | # verbose=True, 66 | ) 67 | 68 | DESCRIPTION = """ 69 | # llama2-webui 70 | """ 71 | DESCRIPTION2 = """ 72 | - Supporting models: [Llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) ... 73 | - Supporting model backends: [tranformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp) 74 | """ 75 | 76 | def clear_and_save_textbox(message: str) -> tuple[str, str]: 77 | return "", message 78 | 79 | def save_textbox_for_prompt(message: str) -> str: 80 | logging.info("start save_textbox_from_prompt") 81 | message = convert_summary_to_prompt(message) 82 | return message 83 | 84 | def display_input( 85 | message: str, history: list[tuple[str, str]] 86 | ) -> list[tuple[str, str]]: 87 | history.append((message, "")) 88 | return history 89 | 90 | def delete_prev_fn( 91 | history: list[tuple[str, str]] 92 | ) -> tuple[list[tuple[str, str]], str]: 93 | try: 94 | message, _ = history.pop() 95 | except IndexError: 96 | message = "" 97 | return history, message or "" 98 | 99 | def generate( 100 | message: str, 101 | history_with_input: list[tuple[str, str]], 102 | system_prompt: str, 103 | max_new_tokens: int, 104 | temperature: float, 105 | top_p: float, 106 | top_k: int, 107 | ) -> Iterator[list[tuple[str, str]]]: 108 | if max_new_tokens > MAX_MAX_NEW_TOKENS: 109 | raise ValueError 110 | try: 111 | history = history_with_input[:-1] 112 | generator = llama2_wrapper.run( 113 | message, 114 | history, 115 | system_prompt, 116 | max_new_tokens, 117 | temperature, 118 | top_p, 119 | top_k, 120 | ) 121 | try: 122 | first_response = next(generator) 123 | yield history + [(message, first_response)] 124 | except StopIteration: 125 | yield history + [(message, "")] 126 | for response in generator: 127 | yield history + [(message, response)] 128 | except Exception as e: 129 | logging.exception(e) 130 | 131 | def check_input_token_length( 132 | message: str, chat_history: list[tuple[str, str]], system_prompt: str 133 | ) -> None: 134 | input_token_length = llama2_wrapper.get_input_token_length( 135 | message, chat_history, system_prompt 136 | ) 137 | if input_token_length > MAX_INPUT_TOKEN_LENGTH: 138 | raise gr.Error( 139 | f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again." 
140 | ) 141 | 142 | prompts_container = PromtsContainer() 143 | prompts = prompts_container.get_prompts_tab_dict() 144 | default_prompts_checkbox = False 145 | default_advanced_checkbox = False 146 | 147 | def convert_summary_to_prompt(summary): 148 | return prompts_container.get_prompt_by_summary(summary) 149 | 150 | def two_columns_list(tab_data, chatbot): 151 | result = [] 152 | for i in range(int(len(tab_data) / 2) + 1): 153 | row = gr.Row() 154 | with row: 155 | for j in range(2): 156 | index = 2 * i + j 157 | if index >= len(tab_data): 158 | break 159 | item = tab_data[index] 160 | with gr.Group(): 161 | gr.HTML( 162 | f'
{item["act"]}
' 163 | ) 164 | prompt_text = gr.Button( 165 | label="", 166 | value=f"{item['summary']}", 167 | size="sm", 168 | elem_classes="text-left-aligned", 169 | ) 170 | prompt_text.click( 171 | fn=save_textbox_for_prompt, 172 | inputs=prompt_text, 173 | outputs=saved_input, 174 | api_name=False, 175 | queue=True, 176 | ).then( 177 | fn=display_input, 178 | inputs=[saved_input, chatbot], 179 | outputs=chatbot, 180 | api_name=False, 181 | queue=True, 182 | ).then( 183 | fn=check_input_token_length, 184 | inputs=[saved_input, chatbot, system_prompt], 185 | api_name=False, 186 | queue=False, 187 | ).success( 188 | fn=generate, 189 | inputs=[ 190 | saved_input, 191 | chatbot, 192 | system_prompt, 193 | max_new_tokens, 194 | temperature, 195 | top_p, 196 | top_k, 197 | ], 198 | outputs=chatbot, 199 | api_name=False, 200 | ) 201 | result.append(row) 202 | return result 203 | 204 | CSS = """ 205 | .contain { display: flex; flex-direction: column;} 206 | #component-0 #component-1 #component-2 #component-4 #component-5 { height:71vh !important; } 207 | #component-0 #component-1 #component-24 > div:nth-child(2) { height:80vh !important; overflow-y:auto } 208 | .text-left-aligned {text-align: left !important; font-size: 16px;} 209 | """ 210 | with gr.Blocks(css=CSS) as demo: 211 | with gr.Row(equal_height=True): 212 | with gr.Column(scale=2): 213 | gr.Markdown(DESCRIPTION) 214 | with gr.Group(): 215 | chatbot = gr.Chatbot(label="Chatbot") 216 | with gr.Row(): 217 | textbox = gr.Textbox( 218 | container=False, 219 | show_label=False, 220 | placeholder="Type a message...", 221 | scale=10, 222 | ) 223 | submit_button = gr.Button( 224 | "Submit", variant="primary", scale=1, min_width=0 225 | ) 226 | with gr.Row(): 227 | retry_button = gr.Button("🔄 Retry", variant="secondary") 228 | undo_button = gr.Button("↩️ Undo", variant="secondary") 229 | clear_button = gr.Button("🗑️ Clear", variant="secondary") 230 | 231 | saved_input = gr.State() 232 | with gr.Row(): 233 | advanced_checkbox = gr.Checkbox( 234 | label="Advanced", 235 | value=default_prompts_checkbox, 236 | container=False, 237 | elem_classes="min_check", 238 | ) 239 | prompts_checkbox = gr.Checkbox( 240 | label="Prompts", 241 | value=default_prompts_checkbox, 242 | container=False, 243 | elem_classes="min_check", 244 | ) 245 | with gr.Column(visible=default_advanced_checkbox) as advanced_column: 246 | system_prompt = gr.Textbox( 247 | label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6 248 | ) 249 | max_new_tokens = gr.Slider( 250 | label="Max new tokens", 251 | minimum=1, 252 | maximum=MAX_MAX_NEW_TOKENS, 253 | step=1, 254 | value=DEFAULT_MAX_NEW_TOKENS, 255 | ) 256 | temperature = gr.Slider( 257 | label="Temperature", 258 | minimum=0.1, 259 | maximum=4.0, 260 | step=0.1, 261 | value=1.0, 262 | ) 263 | top_p = gr.Slider( 264 | label="Top-p (nucleus sampling)", 265 | minimum=0.05, 266 | maximum=1.0, 267 | step=0.05, 268 | value=0.95, 269 | ) 270 | top_k = gr.Slider( 271 | label="Top-k", 272 | minimum=1, 273 | maximum=1000, 274 | step=1, 275 | value=50, 276 | ) 277 | with gr.Column(scale=1, visible=default_prompts_checkbox) as prompt_column: 278 | gr.HTML( 279 | '
\N{four leaf clover} prompts
' 280 | ) 281 | for k, v in prompts.items(): 282 | with gr.Tab(k, scroll_to_output=True): 283 | lst = two_columns_list(v, chatbot) 284 | prompts_checkbox.change( 285 | lambda x: gr.update(visible=x), 286 | prompts_checkbox, 287 | prompt_column, 288 | queue=False, 289 | ) 290 | advanced_checkbox.change( 291 | lambda x: gr.update(visible=x), 292 | advanced_checkbox, 293 | advanced_column, 294 | queue=False, 295 | ) 296 | 297 | textbox.submit( 298 | fn=clear_and_save_textbox, 299 | inputs=textbox, 300 | outputs=[textbox, saved_input], 301 | api_name=False, 302 | queue=False, 303 | ).then( 304 | fn=display_input, 305 | inputs=[saved_input, chatbot], 306 | outputs=chatbot, 307 | api_name=False, 308 | queue=False, 309 | ).then( 310 | fn=check_input_token_length, 311 | inputs=[saved_input, chatbot, system_prompt], 312 | api_name=False, 313 | queue=False, 314 | ).success( 315 | fn=generate, 316 | inputs=[ 317 | saved_input, 318 | chatbot, 319 | system_prompt, 320 | max_new_tokens, 321 | temperature, 322 | top_p, 323 | top_k, 324 | ], 325 | outputs=chatbot, 326 | api_name=False, 327 | ) 328 | 329 | button_event_preprocess = ( 330 | submit_button.click( 331 | fn=clear_and_save_textbox, 332 | inputs=textbox, 333 | outputs=[textbox, saved_input], 334 | api_name=False, 335 | queue=False, 336 | ) 337 | .then( 338 | fn=display_input, 339 | inputs=[saved_input, chatbot], 340 | outputs=chatbot, 341 | api_name=False, 342 | queue=False, 343 | ) 344 | .then( 345 | fn=check_input_token_length, 346 | inputs=[saved_input, chatbot, system_prompt], 347 | api_name=False, 348 | queue=False, 349 | ) 350 | .success( 351 | fn=generate, 352 | inputs=[ 353 | saved_input, 354 | chatbot, 355 | system_prompt, 356 | max_new_tokens, 357 | temperature, 358 | top_p, 359 | top_k, 360 | ], 361 | outputs=chatbot, 362 | api_name=False, 363 | ) 364 | ) 365 | 366 | retry_button.click( 367 | fn=delete_prev_fn, 368 | inputs=chatbot, 369 | outputs=[chatbot, saved_input], 370 | api_name=False, 371 | queue=False, 372 | ).then( 373 | fn=display_input, 374 | inputs=[saved_input, chatbot], 375 | outputs=chatbot, 376 | api_name=False, 377 | queue=False, 378 | ).then( 379 | fn=generate, 380 | inputs=[ 381 | saved_input, 382 | chatbot, 383 | system_prompt, 384 | max_new_tokens, 385 | temperature, 386 | top_p, 387 | top_k, 388 | ], 389 | outputs=chatbot, 390 | api_name=False, 391 | ) 392 | 393 | undo_button.click( 394 | fn=delete_prev_fn, 395 | inputs=chatbot, 396 | outputs=[chatbot, saved_input], 397 | api_name=False, 398 | queue=False, 399 | ).then( 400 | fn=lambda x: x, 401 | inputs=[saved_input], 402 | outputs=textbox, 403 | api_name=False, 404 | queue=False, 405 | ) 406 | 407 | clear_button.click( 408 | fn=lambda: ([], ""), 409 | outputs=[chatbot, saved_input], 410 | queue=False, 411 | api_name=False, 412 | ) 413 | 414 | demo.queue(max_size=20).launch(share=args.share) 415 | 416 | 417 | if __name__ == "__main__": 418 | main() 419 | -------------------------------------------------------------------------------- /llama2_wrapper/server/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import multiprocessing 3 | from re import compile, Match, Pattern 4 | from threading import Lock 5 | from functools import partial 6 | from typing import Callable, Coroutine, Iterator, List, Optional, Tuple, Union, Dict 7 | from typing_extensions import TypedDict, Literal 8 | 9 | import anyio 10 | from anyio.streams.memory import MemoryObjectSendStream 11 | from starlette.concurrency import 
run_in_threadpool, iterate_in_threadpool 12 | from fastapi import Depends, FastAPI, APIRouter, Request, Response 13 | from fastapi.middleware.cors import CORSMiddleware 14 | from fastapi.responses import JSONResponse 15 | from fastapi.routing import APIRoute 16 | from pydantic import BaseModel, Field 17 | from pydantic_settings import BaseSettings 18 | from sse_starlette.sse import EventSourceResponse 19 | 20 | from llama2_wrapper.model import LLAMA2_WRAPPER 21 | from llama2_wrapper.types import ( 22 | Completion, 23 | CompletionChunk, 24 | ChatCompletion, 25 | ChatCompletionChunk, 26 | ) 27 | 28 | 29 | class Settings(BaseSettings): 30 | model_path: str = Field( 31 | default="", 32 | description="The path to the model to use for generating completions.", 33 | ) 34 | backend_type: str = Field( 35 | default="llama.cpp", 36 | description="Backend for llama2, options: llama.cpp, gptq, transformers", 37 | ) 38 | max_tokens: int = Field(default=4000, ge=1, description="Maximum context size.") 39 | load_in_8bit: bool = Field( 40 | default=False, 41 | description="`Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models).", 42 | ) 43 | verbose: bool = Field( 44 | default=False, 45 | description="Whether to print verbose output to stderr.", 46 | ) 47 | host: str = Field(default="localhost", description="API address") 48 | port: int = Field(default=8000, description="API port") 49 | interrupt_requests: bool = Field( 50 | default=True, 51 | description="Whether to interrupt requests when a new request is received.", 52 | ) 53 | 54 | 55 | class ErrorResponse(TypedDict): 56 | """OpenAI style error response""" 57 | 58 | message: str 59 | type: str 60 | param: Optional[str] 61 | code: Optional[str] 62 | 63 | 64 | class ErrorResponseFormatters: 65 | """Collection of formatters for error responses. 66 | 67 | Args: 68 | request (Union[CreateCompletionRequest, CreateChatCompletionRequest]): 69 | Request body 70 | match (Match[str]): Match object from regex pattern 71 | 72 | Returns: 73 | Tuple[int, ErrorResponse]: Status code and error response 74 | """ 75 | 76 | @staticmethod 77 | def context_length_exceeded( 78 | request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 79 | match, # type: Match[str] # type: ignore 80 | ) -> Tuple[int, ErrorResponse]: 81 | """Formatter for context length exceeded error""" 82 | 83 | context_window = int(match.group(2)) 84 | prompt_tokens = int(match.group(1)) 85 | completion_tokens = request.max_new_tokens 86 | if hasattr(request, "messages"): 87 | # Chat completion 88 | message = ( 89 | "This model's maximum context length is {} tokens. " 90 | "However, you requested {} tokens " 91 | "({} in the messages, {} in the completion). " 92 | "Please reduce the length of the messages or completion." 93 | ) 94 | else: 95 | # Text completion 96 | message = ( 97 | "This model's maximum context length is {} tokens, " 98 | "however you requested {} tokens " 99 | "({} in your prompt; {} for the completion). " 100 | "Please reduce your prompt; or completion length." 
101 | ) 102 | return 400, ErrorResponse( 103 | message=message.format( 104 | context_window, 105 | completion_tokens + prompt_tokens, 106 | prompt_tokens, 107 | completion_tokens, 108 | ), 109 | type="invalid_request_error", 110 | param="messages", 111 | code="context_length_exceeded", 112 | ) 113 | 114 | @staticmethod 115 | def model_not_found( 116 | request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 117 | match, # type: Match[str] # type: ignore 118 | ) -> Tuple[int, ErrorResponse]: 119 | """Formatter for model_not_found error""" 120 | 121 | model_path = str(match.group(1)) 122 | message = f"The model `{model_path}` does not exist" 123 | return 400, ErrorResponse( 124 | message=message, 125 | type="invalid_request_error", 126 | param=None, 127 | code="model_not_found", 128 | ) 129 | 130 | 131 | class RouteErrorHandler(APIRoute): 132 | """Custom APIRoute that handles application errors and exceptions""" 133 | 134 | # key: regex pattern for original error message from llama_cpp 135 | # value: formatter function 136 | pattern_and_formatters: Dict[ 137 | "Pattern", 138 | Callable[ 139 | [ 140 | Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 141 | "Match[str]", 142 | ], 143 | Tuple[int, ErrorResponse], 144 | ], 145 | ] = { 146 | compile( 147 | r"Requested tokens \((\d+)\) exceed context window of (\d+)" 148 | ): ErrorResponseFormatters.context_length_exceeded, 149 | compile( 150 | r"Model path does not exist: (.+)" 151 | ): ErrorResponseFormatters.model_not_found, 152 | } 153 | 154 | def error_message_wrapper( 155 | self, 156 | error: Exception, 157 | body: Optional[ 158 | Union[ 159 | "CreateChatCompletionRequest", 160 | "CreateCompletionRequest", 161 | ] 162 | ] = None, 163 | ) -> Tuple[int, ErrorResponse]: 164 | """Wraps error message in OpenAI style error response""" 165 | 166 | if body is not None and isinstance( 167 | body, 168 | ( 169 | CreateCompletionRequest, 170 | CreateChatCompletionRequest, 171 | ), 172 | ): 173 | # When text completion or chat completion 174 | for pattern, callback in self.pattern_and_formatters.items(): 175 | match = pattern.search(str(error)) 176 | if match is not None: 177 | return callback(body, match) 178 | 179 | # Wrap other errors as internal server error 180 | return 500, ErrorResponse( 181 | message=str(error), 182 | type="internal_server_error", 183 | param=None, 184 | code=None, 185 | ) 186 | 187 | def get_route_handler( 188 | self, 189 | ) -> Callable[[Request], Coroutine[None, None, Response]]: 190 | """Defines custom route handler that catches exceptions and formats 191 | in OpenAI style error response""" 192 | 193 | original_route_handler = super().get_route_handler() 194 | 195 | async def custom_route_handler(request: Request) -> Response: 196 | try: 197 | return await original_route_handler(request) 198 | except Exception as exc: 199 | json_body = await request.json() 200 | try: 201 | if "messages" in json_body: 202 | # Chat completion 203 | body: Optional[ 204 | Union[ 205 | CreateChatCompletionRequest, 206 | CreateCompletionRequest, 207 | ] 208 | ] = CreateChatCompletionRequest(**json_body) 209 | elif "prompt" in json_body: 210 | # Text completion 211 | body = CreateCompletionRequest(**json_body) 212 | # else: 213 | # # Embedding 214 | # body = CreateEmbeddingRequest(**json_body) 215 | except Exception: 216 | # Invalid request body 217 | body = None 218 | 219 | # Get proper error message from the exception 220 | ( 221 | status_code, 222 | error_message, 223 | ) = self.error_message_wrapper(error=exc, 
body=body) 224 | return JSONResponse( 225 | {"error": error_message}, 226 | status_code=status_code, 227 | ) 228 | 229 | return custom_route_handler 230 | 231 | 232 | router = APIRouter(route_class=RouteErrorHandler) 233 | 234 | settings: Optional[Settings] = None 235 | llama2: Optional[LLAMA2_WRAPPER] = None 236 | 237 | 238 | def create_app(settings: Optional[Settings] = None): 239 | if settings is None: 240 | settings = Settings() 241 | app = FastAPI( 242 | title="llama2-wrapper Fast API", 243 | version="0.0.1", 244 | ) 245 | app.add_middleware( 246 | CORSMiddleware, 247 | allow_origins=["*"], 248 | allow_credentials=True, 249 | allow_methods=["*"], 250 | allow_headers=["*"], 251 | ) 252 | app.include_router(router) 253 | global llama2 254 | llama2 = LLAMA2_WRAPPER( 255 | model_path=settings.model_path, 256 | backend_type=settings.backend_type, 257 | max_tokens=settings.max_tokens, 258 | load_in_8bit=settings.load_in_8bit, 259 | verbose=settings.load_in_8bit, 260 | ) 261 | 262 | def set_settings(_settings: Settings): 263 | global settings 264 | settings = _settings 265 | 266 | set_settings(settings) 267 | return app 268 | 269 | 270 | llama_outer_lock = Lock() 271 | llama_inner_lock = Lock() 272 | 273 | 274 | def get_llama(): 275 | # NOTE: This double lock allows the currently streaming llama model to 276 | # check if any other requests are pending in the same thread and cancel 277 | # the stream if so. 278 | llama_outer_lock.acquire() 279 | release_outer_lock = True 280 | try: 281 | llama_inner_lock.acquire() 282 | try: 283 | llama_outer_lock.release() 284 | release_outer_lock = False 285 | yield llama2 286 | finally: 287 | llama_inner_lock.release() 288 | finally: 289 | if release_outer_lock: 290 | llama_outer_lock.release() 291 | 292 | 293 | def get_settings(): 294 | yield settings 295 | 296 | 297 | async def get_event_publisher( 298 | request: Request, 299 | inner_send_chan: MemoryObjectSendStream, 300 | iterator: Iterator, 301 | ): 302 | async with inner_send_chan: 303 | try: 304 | async for chunk in iterate_in_threadpool(iterator): 305 | await inner_send_chan.send(dict(data=json.dumps(chunk))) 306 | if await request.is_disconnected(): 307 | raise anyio.get_cancelled_exc_class()() 308 | if settings.interrupt_requests and llama_outer_lock.locked(): 309 | await inner_send_chan.send(dict(data="[DONE]")) 310 | raise anyio.get_cancelled_exc_class()() 311 | await inner_send_chan.send(dict(data="[DONE]")) 312 | except anyio.get_cancelled_exc_class() as e: 313 | print("disconnected") 314 | with anyio.move_on_after(1, shield=True): 315 | print(f"Disconnected from client (via refresh/close) {request.client}") 316 | raise e 317 | 318 | 319 | stream_field = Field( 320 | default=False, 321 | description="Whether to stream the results as they are generated. Useful for chatbots.", 322 | ) 323 | max_new_tokens_field = Field( 324 | default=1000, ge=1, description="The maximum number of tokens to generate." 
325 | ) 326 | 327 | temperature_field = Field( 328 | default=0.9, 329 | ge=0.0, 330 | le=2.0, 331 | description="The temperature to use for sampling.", 332 | ) 333 | 334 | top_p_field = Field( 335 | default=1.0, 336 | ge=0.0, 337 | le=1.0, 338 | description="The top-p value to use for sampling.", 339 | ) 340 | top_k_field = Field( 341 | default=40, 342 | ge=0, 343 | description="The top-k value to use for sampling.", 344 | ) 345 | repetition_penalty_field = Field( 346 | default=1.0, 347 | ge=0.0, 348 | description="The penalty to apply to repeated tokens.", 349 | ) 350 | # stop_field = Field( 351 | # default=None, 352 | # description="A list of tokens at which to stop generation. If None, no stop tokens are used.", 353 | # ) 354 | 355 | 356 | class CreateCompletionRequest(BaseModel): 357 | prompt: Union[str, List[str]] = Field( 358 | default="", description="The prompt to generate text from." 359 | ) 360 | stream: bool = stream_field 361 | max_new_tokens: int = max_new_tokens_field 362 | temperature: float = temperature_field 363 | top_p: float = top_p_field 364 | top_k: int = top_k_field 365 | repetition_penalty: float = repetition_penalty_field 366 | # stop: Optional[Union[str, List[str]]] = stop_field 367 | 368 | model_config = { 369 | "json_schema_extra": { 370 | "examples": [ 371 | { 372 | "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", 373 | # "stop": ["\n", "###"], 374 | } 375 | ] 376 | } 377 | } 378 | 379 | 380 | @router.post( 381 | "/v1/completions", 382 | ) 383 | async def create_completion( 384 | request: Request, 385 | body: CreateCompletionRequest, 386 | llama2: LLAMA2_WRAPPER = Depends(get_llama), 387 | ) -> Completion: 388 | if isinstance(body.prompt, list): 389 | assert len(body.prompt) <= 1 390 | body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" 391 | 392 | kwargs = body.model_dump() 393 | 394 | iterator_or_completion: Union[ 395 | Completion, Iterator[CompletionChunk] 396 | ] = await run_in_threadpool(llama2.completion, **kwargs) 397 | 398 | if isinstance(iterator_or_completion, Iterator): 399 | first_response = await run_in_threadpool(next, iterator_or_completion) 400 | 401 | # If no exception was raised from first_response, we can assume that 402 | # the iterator is valid and we can use it to stream the response. 403 | def iterator() -> Iterator[CompletionChunk]: 404 | yield first_response 405 | yield from iterator_or_completion 406 | 407 | send_chan, recv_chan = anyio.create_memory_object_stream(10) 408 | return EventSourceResponse( 409 | recv_chan, 410 | data_sender_callable=partial( # type: ignore 411 | get_event_publisher, 412 | request=request, 413 | inner_send_chan=send_chan, 414 | iterator=iterator(), 415 | ), 416 | ) 417 | else: 418 | return iterator_or_completion 419 | 420 | 421 | class ChatCompletionRequestMessage(BaseModel): 422 | role: Literal["system", "user", "assistant"] = Field( 423 | default="user", description="The role of the message." 424 | ) 425 | content: str = Field(default="", description="The content of the message.") 426 | 427 | 428 | class CreateChatCompletionRequest(BaseModel): 429 | messages: List[ChatCompletionRequestMessage] = Field( 430 | default=[], description="A list of messages to generate completions for." 
431 | ) 432 | stream: bool = stream_field 433 | max_new_tokens: int = max_new_tokens_field 434 | temperature: float = temperature_field 435 | top_p: float = top_p_field 436 | top_k: int = top_k_field 437 | repetition_penalty: float = repetition_penalty_field 438 | # stop: Optional[List[str]] = stop_field 439 | 440 | model_config = { 441 | "json_schema_extra": { 442 | "examples": [ 443 | { 444 | "messages": [ 445 | ChatCompletionRequestMessage( 446 | role="system", content="You are a helpful assistant." 447 | ).model_dump(), 448 | ChatCompletionRequestMessage( 449 | role="user", content="What is the capital of France?" 450 | ).model_dump(), 451 | ] 452 | } 453 | ] 454 | } 455 | } 456 | 457 | 458 | @router.post( 459 | "/v1/chat/completions", 460 | ) 461 | async def create_chat_completion( 462 | request: Request, 463 | body: CreateChatCompletionRequest, 464 | llama2: LLAMA2_WRAPPER = Depends(get_llama), 465 | settings: Settings = Depends(get_settings), 466 | ) -> ChatCompletion: 467 | kwargs = body.model_dump() 468 | 469 | iterator_or_completion: Union[ 470 | ChatCompletion, Iterator[ChatCompletionChunk] 471 | ] = await run_in_threadpool(llama2.chat_completion, **kwargs) 472 | 473 | if isinstance(iterator_or_completion, Iterator): 474 | first_response = await run_in_threadpool(next, iterator_or_completion) 475 | 476 | # If no exception was raised from first_response, we can assume that 477 | # the iterator is valid and we can use it to stream the response. 478 | def iterator() -> Iterator[ChatCompletionChunk]: 479 | yield first_response 480 | yield from iterator_or_completion 481 | 482 | send_chan, recv_chan = anyio.create_memory_object_stream(10) 483 | return EventSourceResponse( 484 | recv_chan, 485 | data_sender_callable=partial( # type: ignore 486 | get_event_publisher, 487 | request=request, 488 | inner_send_chan=send_chan, 489 | iterator=iterator(), 490 | ), 491 | ) 492 | else: 493 | return iterator_or_completion 494 | 495 | 496 | class ModelData(TypedDict): 497 | id: str 498 | object: Literal["model"] 499 | owned_by: str 500 | permissions: List[str] 501 | 502 | 503 | class ModelList(TypedDict): 504 | object: Literal["list"] 505 | data: List[ModelData] 506 | 507 | 508 | @router.get("/v1/models") 509 | async def get_models( 510 | settings: Settings = Depends(get_settings), 511 | ) -> ModelList: 512 | assert llama2 is not None 513 | 514 | return { 515 | "object": "list", 516 | "data": [ 517 | { 518 | "id": settings.backend_type + " default model" 519 | if settings.model_path == "" 520 | else settings.model_path, 521 | "object": "model", 522 | "owned_by": "me", 523 | "permissions": [], 524 | } 525 | ], 526 | } 527 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llama2-webui 2 | 3 | Running Llama 2 with gradio web UI on GPU or CPU from anywhere (Linux/Windows/Mac). 4 | - Supporting all Llama 2 models (7B, 13B, 70B, GPTQ, GGML, GGUF, [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)) with 8-bit, 4-bit mode. 5 | - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb). 6 | - [Run OpenAI Compatible API](#start-openai-compatible-api) on Llama2 models. 
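As a quick illustration of that last point, once the API server from [Start OpenAI Compatible API](#start-openai-compatible-api) is running on its default `localhost:8000`, the `/v1/completions` route can be called from any HTTP client. A minimal sketch, assuming the `requests` package is available in your environment:

```python
import requests  # assumed available; any OpenAI-compatible or plain HTTP client works

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "What is the capital of France?",
        "max_new_tokens": 128,
        "temperature": 0.9,
    },
)
# non-streaming responses follow the OpenAI completion shape
print(resp.json()["choices"][0]["text"])
```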
7 | 8 | ![screenshot](./static/screenshot.png) 9 | 10 | ![code_llama_playground](https://i.imgur.com/FgMUiT6.gif) 11 | 12 | ## Features 13 | 14 | - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [Llama-2-GGUF](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) ... 15 | - Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp) 16 | - Demos: [Run Llama2 on MacBook Air](https://twitter.com/liltom_eth/status/1682791729207070720?s=20); [Run Llama2 on free Colab T4 GPU](./colab/Llama_2_7b_Chat_GPTQ.ipynb) 17 | - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb). 18 | - [Run OpenAI Compatible API](#start-openai-compatible-api) on Llama2 models. 19 | - [News](./docs/news.md), [Benchmark](./docs/performance.md), [Issue Solutions](./docs/issues.md) 20 | 21 | ## Contents 22 | 23 | - [Install](#install) 24 | - [Usage](#usage) 25 | - [Start Chat UI](#start-chat-ui) 26 | - [Start Code Llama UI](#start-code-llama-ui) 27 | - [Use llama2-wrapper for Your App](#use-llama2-wrapper-for-your-app) 28 | - [Start OpenAI Compatible API](#start-openai-compatible-api) 29 | - [Benchmark](#benchmark) 30 | - [Download Llama-2 Models](#download-llama-2-models) 31 | - [Model List](#model-list) 32 | - [Download Script](#download-script) 33 | - [Tips](#tips) 34 | - [Env Examples](#env-examples) 35 | - [Run on Nvidia GPU](#run-on-nvidia-gpu) 36 | - [Run bitsandbytes 8 bit](#run-bitsandbytes-8-bit) 37 | - [Run GPTQ 4 bit](#run-gptq-4-bit) 38 | - [Run on CPU](#run-on-cpu) 39 | - [Mac Metal Acceleration](#mac-metal-acceleration) 40 | - [AMD/Nvidia GPU Acceleration](#amdnvidia-gpu-acceleration) 41 | - [License](#license) 42 | - [Contributing](#contributing) 43 | 44 | 45 | 46 | ## Install 47 | ### Method 1: From [PyPI](https://pypi.org/project/llama2-wrapper/) 48 | ``` 49 | pip install llama2-wrapper 50 | ``` 51 | The newest `llama2-wrapper>=0.1.14` supports llama.cpp's `gguf` models. 52 | 53 | If you would like to use old `ggml` models, install `llama2-wrapper<=0.1.13` or manually install `llama-cpp-python==0.1.77`. 54 | 55 | ### Method 2: From Source: 56 | 57 | ``` 58 | git clone https://github.com/liltom-eth/llama2-webui.git 59 | cd llama2-webui 60 | pip install -r requirements.txt 61 | ``` 62 | ### Install Issues: 63 | `bitsandbytes >= 0.39` may not work on older NVIDIA GPUs. 
In that case, to use `LOAD_IN_8BIT`, you may have to downgrade like this: 64 | 65 | - `pip install bitsandbytes==0.38.1` 66 | 67 | `bitsandbytes` also need a special install for Windows: 68 | 69 | ``` 70 | pip uninstall bitsandbytes 71 | pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.0-py3-none-win_amd64.whl 72 | ``` 73 | 74 | ## Usage 75 | 76 | ### Start Chat UI 77 | 78 | Run chatbot simply with web UI: 79 | 80 | ```bash 81 | python app.py 82 | ``` 83 | 84 | `app.py` will load the default config `.env` which uses `llama.cpp` as the backend to run `llama-2-7b-chat.ggmlv3.q4_0.bin` model for inference. The model `llama-2-7b-chat.ggmlv3.q4_0.bin` will be automatically downloaded. 85 | 86 | ```bash 87 | Running on backend llama.cpp. 88 | Use default model path: ./models/llama-2-7b-chat.Q4_0.gguf 89 | Start downloading model to: ./models/llama-2-7b-chat.Q4_0.gguf 90 | ``` 91 | 92 | You can also customize your `MODEL_PATH`, `BACKEND_TYPE,` and model configs in `.env` file to run different llama2 models on different backends (llama.cpp, transformers, gptq). 93 | 94 | ### Start Code Llama UI 95 | 96 | We provide a code completion / filling UI for Code Llama. 97 | 98 | Base model **Code Llama** and extend model **Code Llama — Python** are not fine-tuned to follow instructions. They should be prompted so that the expected answer is the natural continuation of the prompt. That means these two models focus on code filling and code completion. 99 | 100 | Here is an example run CodeLlama code completion on llama.cpp backend: 101 | 102 | ``` 103 | python code_completion.py --model_path ./models/codellama-7b.Q4_0.gguf 104 | ``` 105 | 106 | ![code_llama_playground](https://i.imgur.com/FgMUiT6.gif) 107 | 108 | `codellama-7b.Q4_0.gguf` can be downloaded from [TheBloke/CodeLlama-7B-GGUF](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/blob/main/codellama-7b.Q4_0.gguf). 109 | 110 | **Code Llama — Instruct** trained with “natural language instruction” inputs paired with anticipated outputs. This strategic methodology enhances the model’s capacity to grasp human expectations in prompts. That means instruct models can be used in a chatbot-like app. 111 | 112 | Example run CodeLlama chat on gptq backend: 113 | 114 | ``` 115 | python app.py --backend_type gptq --model_path ./models/CodeLlama-7B-Instruct-GPTQ/ --share True 116 | ``` 117 | 118 | ![code_llama_chat](https://i.imgur.com/lQLfemB.gif) 119 | 120 | `CodeLlama-7B-Instruct-GPTQ` can be downloaded from [TheBloke/CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) 121 | 122 | ### Use llama2-wrapper for Your App 123 | 124 | 🔥 For developers, we released `llama2-wrapper` as a llama2 backend wrapper in [PYPI](https://pypi.org/project/llama2-wrapper/). 125 | 126 | Use `llama2-wrapper` as your local llama2 backend to answer questions and more, [colab example](./colab/ggmlv3_q4_0.ipynb): 127 | 128 | ```python 129 | # pip install llama2-wrapper 130 | from llama2_wrapper import LLAMA2_WRAPPER, get_prompt 131 | llama2_wrapper = LLAMA2_WRAPPER() 132 | # Default running on backend llama.cpp. 
133 | # Automatically downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin 134 | prompt = "Do you know Pytorch" 135 | answer = llama2_wrapper(get_prompt(prompt), temperature=0.9) 136 | ``` 137 | 138 | Run gptq llama2 model on Nvidia GPU, [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb): 139 | 140 | ```python 141 | from llama2_wrapper import LLAMA2_WRAPPER 142 | llama2_wrapper = LLAMA2_WRAPPER(backend_type="gptq") 143 | # Automatically downloading model to: ./models/Llama-2-7b-Chat-GPTQ 144 | ``` 145 | 146 | Run llama2 7b with bitsandbytes 8 bit with a `model_path`: 147 | 148 | ```python 149 | from llama2_wrapper import LLAMA2_WRAPPER 150 | llama2_wrapper = LLAMA2_WRAPPER( 151 | model_path = "./models/Llama-2-7b-chat-hf", 152 | backend_type = "transformers", 153 | load_in_8bit = True 154 | ) 155 | ``` 156 | Check [API Document](https://pypi.org/project/llama2-wrapper/) for more usages. 157 | 158 | ### Start OpenAI Compatible API 159 | 160 | `llama2-wrapper` offers a web server that acts as a drop-in replacement for the OpenAI API. This allows you to use Llama2 models with any OpenAI compatible clients, libraries or services, etc. 161 | 162 | Start Fast API: 163 | 164 | ``` 165 | python -m llama2_wrapper.server 166 | ``` 167 | 168 | it will use `llama.cpp` as the backend by default to run `llama-2-7b-chat.ggmlv3.q4_0.bin` model. 169 | 170 | Start Fast API for `gptq` backend: 171 | 172 | ``` 173 | python -m llama2_wrapper.server --backend_type gptq 174 | ``` 175 | 176 | Navigate to http://localhost:8000/docs to see the OpenAPI documentation. 177 | 178 | #### Basic settings 179 | 180 | | Flag | Description | 181 | | ---------------- | ------------------------------------------------------------ | 182 | | `-h`, `--help` | Show this help message. | 183 | | `--model_path` | The path to the model to use for generating completions. | 184 | | `--backend_type` | Backend for llama2, options: llama.cpp, gptq, transformers | 185 | | `--max_tokens` | Maximum context size. | 186 | | `--load_in_8bit` | Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models). | 187 | | `--verbose` | Whether to print verbose output to stderr. | 188 | | `--host` | API address | 189 | | `--port` | API port | 190 | 191 | ## Benchmark 192 | 193 | Run benchmark script to compute performance on your device, `benchmark.py` will load the same `.env` as `app.py`.: 194 | 195 | ```bash 196 | python benchmark.py 197 | ``` 198 | 199 | You can also select the `iter`, `backend_type` and `model_path` the benchmark will be run (overwrite .env args) : 200 | 201 | ```bash 202 | python benchmark.py --iter NB_OF_ITERATIONS --backend_type gptq 203 | ``` 204 | 205 | By default, the number of iterations is 5, but if you want a faster result or a more accurate one 206 | you can set it to whatever value you want, but please only report results with at least 5 iterations. 207 | 208 | This [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb) also show you how to benchmark gptq model on free Google Colab T4 GPU. 
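For example, a run against a local GGUF model on the llama.cpp backend might look like the sketch below (the model path is illustrative, and it assumes `benchmark.py` exposes `--model_path` as described above):

```bash
python benchmark.py --iter 5 --backend_type llama.cpp --model_path ./models/llama-2-7b-chat.Q4_0.gguf
```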
209 | 210 | Some benchmark performance: 211 | 212 | | Model | Precision | Device | RAM / GPU VRAM | Speed (tokens/sec) | load time (s) | 213 | | --------------------------- | --------- | ------------------ | -------------- | ------------------ | ------------- | 214 | | Llama-2-7b-chat-hf | 8 bit | NVIDIA RTX 2080 Ti | 7.7 GB VRAM | 3.76 | 641.36 | 215 | | Llama-2-7b-Chat-GPTQ | 4 bit | NVIDIA RTX 2080 Ti | 5.8 GB VRAM | 18.85 | 192.91 | 216 | | Llama-2-7b-Chat-GPTQ | 4 bit | Google Colab T4 | 5.8 GB VRAM | 18.19 | 37.44 | 217 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M1 Pro CPU | 5.4 GB RAM | 17.90 | 0.18 | 218 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 CPU | 5.4 GB RAM | 13.70 | 0.13 | 219 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 Metal | 5.4 GB RAM | 12.60 | 0.10 | 220 | | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Intel i7-8700 | 4.5 GB RAM | 7.88 | 31.90 | 221 | 222 | Check/contribute the performance of your device in the full [performance doc](./docs/performance.md). 223 | 224 | ## Download Llama-2 Models 225 | 226 | Llama 2 is a collection of pre-trained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. 227 | 228 | Llama-2-7b-Chat-GPTQ is the GPTQ model files for [Meta's Llama 2 7b Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). GPTQ 4-bit Llama-2 model require less GPU VRAM to run it. 229 | 230 | ### Model List 231 | 232 | | Model Name | set MODEL_PATH in .env | Download URL | 233 | | ----------------------------------- | ---------------------------------------- | ------------------------------------------------------------ | 234 | | meta-llama/Llama-2-7b-chat-hf | /path-to/Llama-2-7b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-7b-chat-hf) | 235 | | meta-llama/Llama-2-13b-chat-hf | /path-to/Llama-2-13b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-13b-chat-hf) | 236 | | meta-llama/Llama-2-70b-chat-hf | /path-to/Llama-2-70b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-70b-chat-hf) | 237 | | meta-llama/Llama-2-7b-hf | /path-to/Llama-2-7b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 238 | | meta-llama/Llama-2-13b-hf | /path-to/Llama-2-13b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 239 | | meta-llama/Llama-2-70b-hf | /path-to/Llama-2-70b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 240 | | TheBloke/Llama-2-7b-Chat-GPTQ | /path-to/Llama-2-7b-Chat-GPTQ | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) | 241 | | TheBloke/Llama-2-7b-Chat-GGUF | /path-to/llama-2-7b-chat.Q4_0.gguf | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_0.gguf) | 242 | | TheBloke/Llama-2-7B-Chat-GGML | /path-to/llama-2-7b-chat.ggmlv3.q4_0.bin | [Link](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) | 243 | | TheBloke/CodeLlama-7B-Instruct-GPTQ | TheBloke/CodeLlama-7B-Instruct-GPTQ | [Link](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) | 244 | | ... | ... | ... | 245 | 246 | Running 4-bit model `Llama-2-7b-Chat-GPTQ` needs GPU with 6GB VRAM. 247 | 248 | Running 4-bit model `llama-2-7b-chat.ggmlv3.q4_0.bin` needs CPU with 6GB RAM. There is also a list of other 2, 3, 4, 5, 6, 8-bit GGML models that can be used from [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML). 
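For instance, the 2-bit variant benchmarked earlier could be fetched with the download helper described in the next section; the exact file name here is an assumption based on the repo's naming pattern, so check the model card first:

```bash
python -m llama2_wrapper.download --repo_id TheBloke/Llama-2-7B-Chat-GGML --filename llama-2-7b-chat.ggmlv3.q2_K.bin --save_dir ./models
```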
249 | 250 | ### Download Script 251 | 252 | These models can be downloaded through: 253 | 254 | ```bash 255 | python -m llama2_wrapper.download --repo_id TheBloke/CodeLlama-7B-Python-GPTQ 256 | 257 | python -m llama2_wrapper.download --repo_id TheBloke/Llama-2-7b-Chat-GGUF --filename llama-2-7b-chat.Q4_0.gguf --save_dir ./models 258 | ``` 259 | 260 | Or use CMD like: 261 | 262 | ```bash 263 | # Make sure you have git-lfs installed (https://git-lfs.com) 264 | git lfs install 265 | git clone git@hf.co:meta-llama/Llama-2-7b-chat-hf 266 | ``` 267 | 268 | To download Llama 2 models, you need to request access from [https://ai.meta.com/llama/](https://ai.meta.com/llama/) and also enable access on repos like [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main). Requests will be processed in hours. 269 | 270 | For GPTQ models like [TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), you can directly download without requesting access. 271 | 272 | For GGML models like [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), you can directly download without requesting access. 273 | 274 | ## Tips 275 | 276 | ### Env Examples 277 | 278 | There are some examples in `./env_examples/` folder. 279 | 280 | | Model Setup | Example .env | 281 | | ------------------------------------------------------ | --------------------------- | 282 | | Llama-2-7b-chat-hf 8-bit (transformers backend) | .env.7b_8bit_example | 283 | | Llama-2-7b-Chat-GPTQ 4-bit (gptq transformers backend) | .env.7b_gptq_example | 284 | | Llama-2-7B-Chat-GGML 4bit (llama.cpp backend) | .env.7b_ggmlv3_q4_0_example | 285 | | Llama-2-13b-chat-hf (transformers backend) | .env.13b_example | 286 | | ... | ... | 287 | 288 | ### Run on Nvidia GPU 289 | 290 | The running requires around 14GB of GPU VRAM for Llama-2-7b and 28GB of GPU VRAM for Llama-2-13b. 291 | 292 | If you are running on multiple GPUs, the model will be loaded automatically on GPUs and split the VRAM usage. That allows you to run Llama-2-7b (requires 14GB of GPU VRAM) on a setup like 2 GPUs (11GB VRAM each). 293 | 294 | #### Run bitsandbytes 8 bit 295 | 296 | If you do not have enough memory, you can set up your `LOAD_IN_8BIT` as `True` in `.env`. This can reduce memory usage by around half with slightly degraded model quality. It is compatible with the CPU, GPU, and Metal backend. 297 | 298 | Llama-2-7b with 8-bit compression can run on a single GPU with 8 GB of VRAM, like an Nvidia RTX 2080Ti, RTX 4080, T4, V100 (16GB). 299 | 300 | #### Run GPTQ 4 bit 301 | 302 | If you want to run 4 bit Llama-2 model like `Llama-2-7b-Chat-GPTQ`, you can set up your `BACKEND_TYPE` as `gptq` in `.env` like example `.env.7b_gptq_example`. 303 | 304 | Make sure you have downloaded the 4-bit model from `Llama-2-7b-Chat-GPTQ` and set the `MODEL_PATH` and arguments in `.env` file. 305 | 306 | `Llama-2-7b-Chat-GPTQ` can run on a single GPU with 6 GB of VRAM. 
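Putting the pieces above together, a minimal `.env` for this setup might look like the sketch below (values are illustrative; `.env.7b_gptq_example` in `env_examples` is the authoritative reference):

```
MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ"
BACKEND_TYPE = "gptq"
LOAD_IN_8BIT = False
```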
307 | 308 | If you encounter issue like `NameError: name 'autogptq_cuda_256' is not defined`, please refer to [here](https://huggingface.co/TheBloke/open-llama-13b-open-instruct-GPTQ/discussions/1) 309 | > pip install https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl 310 | 311 | ### Run on CPU 312 | 313 | Run Llama-2 model on CPU requires [llama.cpp](https://github.com/ggerganov/llama.cpp) dependency and [llama.cpp Python Bindings](https://github.com/abetlen/llama-cpp-python), which are already installed. 314 | 315 | 316 | Download GGML models like `llama-2-7b-chat.ggmlv3.q4_0.bin` following [Download Llama-2 Models](#download-llama-2-models) section. `llama-2-7b-chat.ggmlv3.q4_0.bin` model requires at least 6 GB RAM to run on CPU. 317 | 318 | Set up configs like `.env.7b_ggmlv3_q4_0_example` from `env_examples` as `.env`. 319 | 320 | Run web UI `python app.py` . 321 | 322 | #### Mac Metal Acceleration 323 | 324 | For Mac users, you can also set up Mac Metal for acceleration, try install this dependencies: 325 | 326 | ```bash 327 | pip uninstall llama-cpp-python -y 328 | CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir 329 | pip install 'llama-cpp-python[server]' 330 | ``` 331 | 332 | or check details: 333 | 334 | - [MacOS Install with Metal GPU](https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md) 335 | 336 | #### AMD/Nvidia GPU Acceleration 337 | 338 | If you would like to use AMD/Nvidia GPU for acceleration, check this: 339 | 340 | - [Installation with OpenBLAS / cuBLAS / CLBlast / Metal](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal) 341 | 342 | 343 | 344 | 345 | 346 | ## License 347 | 348 | MIT - see [MIT License](LICENSE) 349 | 350 | This project enables users to adapt it freely for proprietary purposes without any restrictions. 351 | 352 | ## Contributing 353 | 354 | Kindly read our [Contributing Guide](CONTRIBUTING.md) to learn and understand our development process. 
355 | 356 | ### All Contributors 357 | 358 | 359 | 360 | 361 | 362 | ### Review 363 | Github 364 | 365 | ### Star History 366 | 367 | [![Star History Chart](https://api.star-history.com/svg?repos=liltom-eth/llama2-webui&type=Date)](https://star-history.com/#liltom-eth/llama2-webui&Date) 368 | 369 | ## Credits 370 | 371 | - https://huggingface.co/meta-llama/Llama-2-7b-chat-hf 372 | - https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat 373 | - https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ 374 | - [https://github.com/ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) 375 | - [https://github.com/TimDettmers/bitsandbytes](https://github.com/TimDettmers/bitsandbytes) 376 | - [https://github.com/PanQiWei/AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) 377 | - [https://github.com/abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) 378 | -------------------------------------------------------------------------------- /colab/webui_CodeLlama_7B_Instruct_GPTQ.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "authorship_tag": "ABX9TyOZhPcZe61RhDjhEFQv0vrl", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | }, 18 | "accelerator": "GPU" 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "id": "7O5JSosg5-rx" 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "!pip install -U llama2-wrapper==0.1.12" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "source": [ 45 | "%cd /content\n", 46 | "!git clone https://github.com/liltom-eth/llama2-webui\n", 47 | "\n", 48 | "%cd /content/llama2-webui\n", 49 | "!python -m llama2_wrapper.download --repo_id TheBloke/CodeLlama-7B-Instruct-GPTQ\n", 50 | "\n", 51 | "%cd /content/llama2-webui\n", 52 | "!python app.py --backend_type gptq --model_path ./models/CodeLlama-7B-Instruct-GPTQ/ --share True" 53 | ], 54 | "metadata": { 55 | "colab": { 56 | "base_uri": "https://localhost:8080/" 57 | }, 58 | "id": "Y6A7bJdkmzY8", 59 | "outputId": "0d702a7d-68ab-4747-f012-246d4dee3718" 60 | }, 61 | "execution_count": 4, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "name": "stdout", 66 | "text": [ 67 | "/content\n", 68 | "fatal: destination path 'llama2-webui' already exists and is not an empty directory.\n", 69 | "/content/llama2-webui\n", 70 | "Start downloading model TheBloke/CodeLlama-7B-Instruct-GPTQ to: ./models/CodeLlama-7B-Instruct-GPTQ\n", 71 | "Fetching 15 files: 0% 0/15 [00:00\n", 499 | " main()\n", 500 | " File \"/content/llama2-webui/app.py\", line 318, in main\n", 501 | " demo.queue(max_size=20).launch(share=args.share)\n", 502 | " File \"/usr/local/lib/python3.10/dist-packages/gradio/blocks.py\", line 2046, in launch\n", 503 | " self.block_thread()\n", 504 | " File \"/usr/local/lib/python3.10/dist-packages/gradio/blocks.py\", line 2132, in block_thread\n", 505 | " print(\"Keyboard interruption in main thread... 
closing server.\")\n", 506 | "KeyboardInterrupt\n", 507 | "Killing tunnel 127.0.0.1:7860 <> https://71c3606942c440e7dd.gradio.live\n", 508 | "terminate called without an active exception\n" 509 | ] 510 | } 511 | ] 512 | } 513 | ] 514 | } -------------------------------------------------------------------------------- /llama2_wrapper/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import uuid 4 | from enum import Enum 5 | from threading import Thread 6 | from typing import Any, Iterator, Union, List 7 | from llama2_wrapper.types import ( 8 | Completion, 9 | CompletionChunk, 10 | ChatCompletion, 11 | ChatCompletionChunk, 12 | # ChatCompletionMessage, 13 | Message, 14 | B_INST, 15 | E_INST, 16 | B_SYS, 17 | E_SYS, 18 | ) 19 | 20 | 21 | class LLAMA2_WRAPPER: 22 | def __init__( 23 | self, 24 | model_path: str = "", 25 | backend_type: str = "llama.cpp", 26 | max_tokens: int = 4000, 27 | load_in_8bit: bool = True, 28 | verbose: bool = False, 29 | ): 30 | """Load a llama2 model from `model_path`. 31 | 32 | Args: 33 | model_path: Path to the model. 34 | backend_type: Backend for llama2, options: llama.cpp, gptq, transformers 35 | max_tokens: Maximum context size. 36 | load_in_8bit: Use bitsandbytes to run model in 8 bit mode (only for transformers models). 37 | verbose: Print verbose output to stderr. 38 | 39 | Raises: 40 | ValueError: If the model path does not exist. 41 | 42 | Returns: 43 | A LLAMA2_WRAPPER instance. 44 | """ 45 | self.model_path = model_path 46 | self.backend_type = BackendType.get_type(backend_type) 47 | self.max_tokens = max_tokens 48 | self.load_in_8bit = load_in_8bit 49 | 50 | self.model = None 51 | self.tokenizer = None 52 | 53 | self.verbose = verbose 54 | 55 | if self.backend_type is BackendType.LLAMA_CPP: 56 | print("Running on backend llama.cpp.") 57 | else: 58 | import torch 59 | 60 | if torch.cuda.is_available(): 61 | print("Running on GPU with backend torch transformers.") 62 | else: 63 | print("GPU CUDA not found.") 64 | 65 | self.default_llamacpp_path = "./models/llama-2-7b-chat.Q4_0.gguf" 66 | self.default_gptq_path = "./models/Llama-2-7b-Chat-GPTQ" 67 | # Download default ggml/gptq model 68 | if self.model_path == "": 69 | print("Model path is empty.") 70 | if self.backend_type is BackendType.LLAMA_CPP: 71 | print("Use default llama.cpp model path: " + self.default_llamacpp_path) 72 | if not os.path.exists(self.default_llamacpp_path): 73 | print("Start downloading model to: " + self.default_llamacpp_path) 74 | from huggingface_hub import hf_hub_download 75 | 76 | hf_hub_download( 77 | repo_id="TheBloke/Llama-2-7b-Chat-GGUF", 78 | filename="llama-2-7b-chat.Q4_0.gguf", 79 | local_dir="./models/", 80 | ) 81 | else: 82 | print("Model exists in ./models/llama-2-7b-chat.Q4_0.gguf.") 83 | self.model_path = self.default_llamacpp_path 84 | elif self.backend_type is BackendType.GPTQ: 85 | print("Use default gptq model path: " + self.default_gptq_path) 86 | if not os.path.exists(self.default_gptq_path): 87 | print("Start downloading model to: " + self.default_gptq_path) 88 | from huggingface_hub import snapshot_download 89 | 90 | snapshot_download( 91 | "TheBloke/Llama-2-7b-Chat-GPTQ", 92 | local_dir=self.default_gptq_path, 93 | ) 94 | else: 95 | print("Model exists in " + self.default_gptq_path) 96 | self.model_path = self.default_gptq_path 97 | 98 | self.init_tokenizer() 99 | self.init_model() 100 | 101 | def init_model(self): 102 | if self.model is None: 103 | self.model = 
LLAMA2_WRAPPER.create_llama2_model( 104 | self.model_path, 105 | self.backend_type, 106 | self.max_tokens, 107 | self.load_in_8bit, 108 | self.verbose, 109 | ) 110 | if self.backend_type is not BackendType.LLAMA_CPP: 111 | self.model.eval() 112 | 113 | def init_tokenizer(self): 114 | if self.backend_type is not BackendType.LLAMA_CPP: 115 | if self.tokenizer is None: 116 | self.tokenizer = LLAMA2_WRAPPER.create_llama2_tokenizer(self.model_path) 117 | 118 | @classmethod 119 | def create_llama2_model( 120 | cls, model_path, backend_type, max_tokens, load_in_8bit, verbose 121 | ): 122 | if backend_type is BackendType.LLAMA_CPP: 123 | from llama_cpp import Llama 124 | 125 | model = Llama( 126 | model_path=model_path, 127 | n_ctx=max_tokens, 128 | n_batch=max_tokens, 129 | verbose=verbose, 130 | ) 131 | elif backend_type is BackendType.GPTQ: 132 | from auto_gptq import AutoGPTQForCausalLM 133 | 134 | model = AutoGPTQForCausalLM.from_quantized( 135 | model_path, 136 | use_safetensors=True, 137 | trust_remote_code=True, 138 | device="cuda:0", 139 | use_triton=False, 140 | quantize_config=None, 141 | ) 142 | elif backend_type is BackendType.TRANSFORMERS: 143 | import torch 144 | from transformers import AutoModelForCausalLM 145 | 146 | model = AutoModelForCausalLM.from_pretrained( 147 | model_path, 148 | device_map="auto", 149 | torch_dtype=torch.float16, 150 | load_in_8bit=load_in_8bit, 151 | ) 152 | else: 153 | print(backend_type + "not implemented.") 154 | return model 155 | 156 | @classmethod 157 | def create_llama2_tokenizer(cls, model_path): 158 | from transformers import AutoTokenizer 159 | 160 | tokenizer = AutoTokenizer.from_pretrained(model_path) 161 | return tokenizer 162 | 163 | def get_token_length( 164 | self, 165 | prompt: str, 166 | ) -> int: 167 | if self.backend_type is BackendType.LLAMA_CPP: 168 | input_ids = self.model.tokenize(bytes(prompt, "utf-8")) 169 | return len(input_ids) 170 | else: 171 | input_ids = self.tokenizer([prompt], return_tensors="np")["input_ids"] 172 | return input_ids.shape[-1] 173 | 174 | def get_input_token_length( 175 | self, 176 | message: str, 177 | chat_history: list[tuple[str, str]] = [], 178 | system_prompt: str = "", 179 | ) -> int: 180 | prompt = get_prompt(message, chat_history, system_prompt) 181 | 182 | return self.get_token_length(prompt) 183 | 184 | def generate( 185 | self, 186 | prompt: str, 187 | max_new_tokens: int = 1000, 188 | temperature: float = 0.9, 189 | top_p: float = 1.0, 190 | top_k: int = 40, 191 | repetition_penalty: float = 1.0, 192 | **kwargs: Any, 193 | ) -> Iterator[str]: 194 | """Create a generator of response from a prompt. 195 | 196 | Examples: 197 | >>> llama2_wrapper = LLAMA2_WRAPPER() 198 | >>> prompt = get_prompt("Hi do you know Pytorch?") 199 | >>> for response in llama2_wrapper.generate(prompt): 200 | ... print(response) 201 | 202 | Args: 203 | prompt: The prompt to generate text from. 204 | max_new_tokens: The maximum number of tokens to generate. 205 | temperature: The temperature to use for sampling. 206 | top_p: The top-p value to use for sampling. 207 | top_k: The top-k value to use for sampling. 208 | repetition_penalty: The penalty to apply to repeated tokens. 209 | kwargs: all other arguments. 210 | 211 | Yields: 212 | The generated text. 
213 | """ 214 | if self.backend_type is BackendType.LLAMA_CPP: 215 | result = self.model( 216 | prompt=prompt, 217 | stream=True, 218 | max_tokens=max_new_tokens, 219 | top_k=top_k, 220 | top_p=top_p, 221 | temperature=temperature, 222 | repeat_penalty=repetition_penalty, 223 | **kwargs, 224 | ) 225 | outputs = [] 226 | for part in result: 227 | text = part["choices"][0]["text"] 228 | outputs.append(text) 229 | yield "".join(outputs) 230 | else: 231 | from transformers import TextIteratorStreamer 232 | 233 | inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda") 234 | 235 | streamer = TextIteratorStreamer( 236 | self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True 237 | ) 238 | generate_kwargs = dict( 239 | inputs, 240 | streamer=streamer, 241 | max_new_tokens=max_new_tokens, 242 | temperature=temperature, 243 | top_p=top_p, 244 | top_k=top_k, 245 | repetition_penalty=repetition_penalty, 246 | # num_beams=1, 247 | ) 248 | generate_kwargs = ( 249 | generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs} 250 | ) 251 | t = Thread(target=self.model.generate, kwargs=generate_kwargs) 252 | t.start() 253 | 254 | outputs = [] 255 | for text in streamer: 256 | outputs.append(text) 257 | yield "".join(outputs) 258 | 259 | def run( 260 | self, 261 | message: str, 262 | chat_history: list[tuple[str, str]] = [], 263 | system_prompt: str = "", 264 | max_new_tokens: int = 1000, 265 | temperature: float = 0.9, 266 | top_p: float = 1.0, 267 | top_k: int = 40, 268 | repetition_penalty: float = 1.0, 269 | ) -> Iterator[str]: 270 | """Create a generator of response from a chat message. 271 | Process message to llama2 prompt with chat history 272 | and system_prompt for chatbot. 273 | 274 | Args: 275 | message: The origianl chat message to generate text from. 276 | chat_history: Chat history list from chatbot. 277 | system_prompt: System prompt for chatbot. 278 | max_new_tokens: The maximum number of tokens to generate. 279 | temperature: The temperature to use for sampling. 280 | top_p: The top-p value to use for sampling. 281 | top_k: The top-k value to use for sampling. 282 | repetition_penalty: The penalty to apply to repeated tokens. 283 | kwargs: all other arguments. 284 | 285 | Yields: 286 | The generated text. 287 | """ 288 | prompt = get_prompt(message, chat_history, system_prompt) 289 | return self.generate( 290 | prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty 291 | ) 292 | 293 | def __call__( 294 | self, 295 | prompt: str, 296 | stream: bool = False, 297 | max_new_tokens: int = 1000, 298 | temperature: float = 0.9, 299 | top_p: float = 1.0, 300 | top_k: int = 40, 301 | repetition_penalty: float = 1.0, 302 | **kwargs: Any, 303 | ) -> Union[str, Iterator[str]]: 304 | """Generate text from a prompt. 305 | 306 | Examples: 307 | >>> llama2_wrapper = LLAMA2_WRAPPER() 308 | >>> prompt = get_prompt("Hi do you know Pytorch?") 309 | >>> print(llama2_wrapper(prompt)) 310 | 311 | Args: 312 | prompt: The prompt to generate text from. 313 | stream: Whether to stream the results. 314 | max_new_tokens: The maximum number of tokens to generate. 315 | temperature: The temperature to use for sampling. 316 | top_p: The top-p value to use for sampling. 317 | top_k: The top-k value to use for sampling. 318 | repetition_penalty: The penalty to apply to repeated tokens. 319 | kwargs: all other arguments. 320 | 321 | Raises: 322 | ValueError: If the requested tokens exceed the context window. 
323 | RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. 324 | 325 | Returns: 326 | Generated text. 327 | """ 328 | if self.backend_type is BackendType.LLAMA_CPP: 329 | completion_or_chunks = self.model.__call__( 330 | prompt, 331 | stream=stream, 332 | max_tokens=max_new_tokens, 333 | temperature=temperature, 334 | top_p=top_p, 335 | top_k=top_k, 336 | repeat_penalty=repetition_penalty, 337 | **kwargs, 338 | ) 339 | if stream: 340 | 341 | def chunk_generator(chunks): 342 | for part in chunks: 343 | chunk = part["choices"][0]["text"] 344 | yield chunk 345 | 346 | chunks: Iterator[str] = chunk_generator(completion_or_chunks) 347 | return chunks 348 | return completion_or_chunks["choices"][0]["text"] 349 | else: 350 | inputs = self.tokenizer([prompt], return_tensors="pt").input_ids 351 | prompt_tokens_len = len(inputs[0]) 352 | inputs = inputs.to("cuda") 353 | generate_kwargs = dict( 354 | inputs=inputs, 355 | max_new_tokens=max_new_tokens, 356 | temperature=temperature, 357 | top_p=top_p, 358 | top_k=top_k, 359 | repetition_penalty=repetition_penalty, 360 | # num_beams=1, 361 | ) 362 | generate_kwargs = ( 363 | generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs} 364 | ) 365 | if stream: 366 | from transformers import TextIteratorStreamer 367 | 368 | streamer = TextIteratorStreamer( 369 | self.tokenizer, 370 | timeout=10.0, 371 | skip_prompt=True, 372 | skip_special_tokens=True, 373 | ) 374 | generate_kwargs["streamer"] = streamer 375 | 376 | t = Thread(target=self.model.generate, kwargs=generate_kwargs) 377 | t.start() 378 | return streamer 379 | else: 380 | output_ids = self.model.generate( 381 | **generate_kwargs, 382 | ) 383 | # skip prompt, skip special tokens 384 | output = self.tokenizer.decode( 385 | output_ids[0][prompt_tokens_len:], skip_special_tokens=True 386 | ) 387 | return output 388 | 389 | def completion( 390 | self, 391 | prompt: str, 392 | stream: bool = False, 393 | max_new_tokens: int = 1000, 394 | temperature: float = 0.9, 395 | top_p: float = 1.0, 396 | top_k: int = 40, 397 | repetition_penalty: float = 1.0, 398 | **kwargs: Any, 399 | ) -> Union[Completion, Iterator[CompletionChunk]]: 400 | """For OpenAI compatible API /v1/completions 401 | Generate text from a prompt. 402 | 403 | Examples: 404 | >>> llama2_wrapper = LLAMA2_WRAPPER() 405 | >>> prompt = get_prompt("Hi do you know Pytorch?") 406 | >>> print(llm.completion(prompt)) 407 | 408 | Args: 409 | prompt: The prompt to generate text from. 410 | stream: Whether to stream the results. 411 | max_new_tokens: The maximum number of tokens to generate. 412 | temperature: The temperature to use for sampling. 413 | top_p: The top-p value to use for sampling. 414 | top_k: The top-k value to use for sampling. 415 | repetition_penalty: The penalty to apply to repeated tokens. 416 | kwargs: all other arguments. 417 | 418 | Raises: 419 | ValueError: If the requested tokens exceed the context window. 420 | RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. 421 | 422 | Returns: 423 | Response object containing the generated text. 
424 | """ 425 | completion_id: str = f"cmpl-{str(uuid.uuid4())}" 426 | created: int = int(time.time()) 427 | model_name: str = ( 428 | self.backend_type + " default model" 429 | if self.model_path == "" 430 | else self.model_path 431 | ) 432 | if self.backend_type is BackendType.LLAMA_CPP: 433 | completion_or_chunks = self.model.__call__( 434 | prompt, 435 | stream=stream, 436 | max_tokens=max_new_tokens, 437 | temperature=temperature, 438 | top_p=top_p, 439 | top_k=top_k, 440 | repeat_penalty=repetition_penalty, 441 | **kwargs, 442 | ) 443 | if stream: 444 | chunks: Iterator[CompletionChunk] = completion_or_chunks 445 | return chunks 446 | return completion_or_chunks 447 | else: 448 | inputs = self.tokenizer([prompt], return_tensors="pt").input_ids 449 | prompt_tokens_len = len(inputs[0]) 450 | inputs = inputs.to("cuda") 451 | generate_kwargs = dict( 452 | inputs=inputs, 453 | max_new_tokens=max_new_tokens, 454 | temperature=temperature, 455 | top_p=top_p, 456 | top_k=top_k, 457 | repetition_penalty=repetition_penalty, 458 | # num_beams=1, 459 | ) 460 | generate_kwargs = ( 461 | generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs} 462 | ) 463 | if stream: 464 | from transformers import TextIteratorStreamer 465 | 466 | streamer = TextIteratorStreamer( 467 | self.tokenizer, 468 | timeout=10.0, 469 | skip_prompt=True, 470 | skip_special_tokens=True, 471 | ) 472 | generate_kwargs["streamer"] = streamer 473 | 474 | t = Thread(target=self.model.generate, kwargs=generate_kwargs) 475 | t.start() 476 | 477 | def chunk_generator(chunks): 478 | for part in chunks: 479 | yield { 480 | "id": completion_id, 481 | "object": "text_completion", 482 | "created": created, 483 | "model": model_name, 484 | "choices": [ 485 | { 486 | "text": part, 487 | "index": 0, 488 | "logprobs": None, 489 | "finish_reason": None, 490 | } 491 | ], 492 | } 493 | 494 | chunks: Iterator[CompletionChunk] = chunk_generator(streamer) 495 | return chunks 496 | 497 | else: 498 | output_ids = self.model.generate( 499 | **generate_kwargs, 500 | ) 501 | total_tokens_len = len(output_ids[0]) 502 | output = self.tokenizer.decode( 503 | output_ids[0][prompt_tokens_len:], skip_special_tokens=True 504 | ) 505 | completion: Completion = { 506 | "id": completion_id, 507 | "object": "text_completion", 508 | "created": created, 509 | "model": model_name, 510 | "choices": [ 511 | { 512 | "text": output, 513 | "index": 0, 514 | "logprobs": None, 515 | "finish_reason": None, 516 | } 517 | ], 518 | "usage": { 519 | "prompt_tokens": prompt_tokens_len, 520 | "completion_tokens": total_tokens_len - prompt_tokens_len, 521 | "total_tokens": total_tokens_len, 522 | }, 523 | } 524 | return completion 525 | 526 | def chat_completion( 527 | self, 528 | messages: List[Message], 529 | stream: bool = False, 530 | max_new_tokens: int = 1000, 531 | temperature: float = 0.9, 532 | top_p: float = 1.0, 533 | top_k: int = 40, 534 | repetition_penalty: float = 1.0, 535 | **kwargs: Any, 536 | ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: 537 | """For OpenAI compatible API /v1/chat/completions 538 | Generate text from a dialog (chat history). 539 | 540 | Examples: 541 | >>> llama2_wrapper = LLAMA2_WRAPPER() 542 | >>> dialog = [ 543 | { 544 | "role":"system", 545 | "content":"You are a helpful, respectful and honest assistant. 
" 546 | },{ 547 | "role":"user", 548 | "content":"Hi do you know Pytorch?", 549 | }, 550 | ] 551 | >>> print(llm.chat_completion(dialog)) 552 | 553 | Args: 554 | dialog: The dialog (chat history) to generate text from. 555 | stream: Whether to stream the results. 556 | max_new_tokens: The maximum number of tokens to generate. 557 | temperature: The temperature to use for sampling. 558 | top_p: The top-p value to use for sampling. 559 | top_k: The top-k value to use for sampling. 560 | repetition_penalty: The penalty to apply to repeated tokens. 561 | kwargs: all other arguments. 562 | 563 | Raises: 564 | ValueError: If the requested tokens exceed the context window. 565 | RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. 566 | 567 | Returns: 568 | Response object containing the generated text. 569 | """ 570 | completion_id: str = f"cmpl-{str(uuid.uuid4())}" 571 | created: int = int(time.time()) 572 | model_name: str = ( 573 | self.backend_type + " default model" 574 | if self.model_path == "" 575 | else self.model_path 576 | ) 577 | if self.backend_type is BackendType.LLAMA_CPP: 578 | completion_or_chunks = self.model.create_chat_completion( 579 | messages, 580 | stream=stream, 581 | max_tokens=max_new_tokens, 582 | temperature=temperature, 583 | top_p=top_p, 584 | top_k=top_k, 585 | repeat_penalty=repetition_penalty, 586 | **kwargs, 587 | ) 588 | if stream: 589 | chunks: Iterator[ChatCompletionChunk] = completion_or_chunks 590 | return chunks 591 | return completion_or_chunks 592 | else: 593 | prompt = get_prompt_for_dialog(messages) 594 | inputs = self.tokenizer([prompt], return_tensors="pt").input_ids 595 | prompt_tokens_len = len(inputs[0]) 596 | inputs = inputs.to("cuda") 597 | generate_kwargs = dict( 598 | inputs=inputs, 599 | max_new_tokens=max_new_tokens, 600 | temperature=temperature, 601 | top_p=top_p, 602 | top_k=top_k, 603 | repetition_penalty=repetition_penalty, 604 | # num_beams=1, 605 | ) 606 | generate_kwargs = ( 607 | generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs} 608 | ) 609 | if stream: 610 | from transformers import TextIteratorStreamer 611 | 612 | streamer = TextIteratorStreamer( 613 | self.tokenizer, 614 | timeout=10.0, 615 | skip_prompt=True, 616 | skip_special_tokens=True, 617 | ) 618 | generate_kwargs["streamer"] = streamer 619 | t = Thread(target=self.model.generate, kwargs=generate_kwargs) 620 | t.start() 621 | 622 | def chunk_generator(chunks): 623 | yield { 624 | "id": "chat" + completion_id, 625 | "model": model_name, 626 | "created": created, 627 | "object": "chat.completion.chunk", 628 | "choices": [ 629 | { 630 | "index": 0, 631 | "delta": { 632 | "role": "assistant", 633 | }, 634 | "finish_reason": None, 635 | } 636 | ], 637 | } 638 | for part in enumerate(chunks): 639 | yield { 640 | "id": "chat" + completion_id, 641 | "model": model_name, 642 | "created": created, 643 | "object": "chat.completion.chunk", 644 | "choices": [ 645 | { 646 | "index": 0, 647 | "delta": { 648 | "content": part, 649 | }, 650 | "finish_reason": None, 651 | } 652 | ], 653 | } 654 | 655 | chunks: Iterator[ChatCompletionChunk] = chunk_generator(streamer) 656 | return chunks 657 | 658 | else: 659 | output_ids = self.model.generate( 660 | **generate_kwargs, 661 | ) 662 | total_tokens_len = len(output_ids[0]) 663 | output = self.tokenizer.decode( 664 | output_ids[0][prompt_tokens_len:], skip_special_tokens=True 665 | ) 666 | chatcompletion: ChatCompletion = { 667 | "id": "chat" + completion_id, 668 | "object": 
"chat.completion", 669 | "created": created, 670 | "model": model_name, 671 | "choices": [ 672 | { 673 | "index": 0, 674 | "message": { 675 | "role": "assistant", 676 | "content": output, 677 | }, 678 | "finish_reason": None, 679 | } 680 | ], 681 | "usage": { 682 | "prompt_tokens": prompt_tokens_len, 683 | "completion_tokens": total_tokens_len - prompt_tokens_len, 684 | "total_tokens": total_tokens_len, 685 | }, 686 | } 687 | return chatcompletion 688 | 689 | 690 | def get_prompt_for_dialog(dialog: List[Message]) -> str: 691 | """Process dialog (chat history) to llama2 prompt for 692 | OpenAI compatible API /v1/chat/completions. 693 | 694 | Examples: 695 | >>> dialog = [ 696 | { 697 | "role":"system", 698 | "content":"You are a helpful, respectful and honest assistant. " 699 | },{ 700 | "role":"user", 701 | "content":"Hi do you know Pytorch?", 702 | }, 703 | ] 704 | >>> prompt = get_prompt_for_dialog("Hi do you know Pytorch?") 705 | 706 | Args: 707 | dialog: The dialog (chat history) to generate text from. 708 | 709 | Yields: 710 | prompt string. 711 | """ 712 | # add "<>\n{system_prompt}\n<>\n\n" in first dialog 713 | if dialog[0]["role"] == "system": 714 | dialog = [ 715 | { 716 | "role": dialog[1]["role"], 717 | "content": B_SYS + dialog[0]["content"] + E_SYS + dialog[1]["content"], 718 | } 719 | ] + dialog[2:] 720 | # check roles 721 | assert all([msg["role"] == "user" for msg in dialog[::2]]) and all( 722 | [msg["role"] == "assistant" for msg in dialog[1::2]] 723 | ), ( 724 | "model only supports 'system', 'user' and 'assistant' roles, " 725 | "starting with 'system', then 'user' and alternating (u/a/u/a/u...)" 726 | ) 727 | # add chat history 728 | texts = [] 729 | for prompt, answer in zip( 730 | dialog[::2], 731 | dialog[1::2], 732 | ): 733 | texts.append( 734 | f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} " 735 | ) 736 | # check last message if role is user, then add it to prompt text 737 | assert ( 738 | dialog[-1]["role"] == "user" 739 | ), f"Last message must be from user, got {dialog[-1]['role']}" 740 | texts.append(f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}") 741 | return "".join(texts) 742 | 743 | 744 | def get_prompt( 745 | message: str, chat_history: list[tuple[str, str]] = [], system_prompt: str = "" 746 | ) -> str: 747 | """Process message to llama2 prompt with chat history 748 | and system_prompt for chatbot. 749 | 750 | Examples: 751 | >>> prompt = get_prompt("Hi do you know Pytorch?") 752 | 753 | Args: 754 | message: The origianl chat message to generate text from. 755 | chat_history: Chat history list from chatbot. 756 | system_prompt: System prompt for chatbot. 757 | 758 | Yields: 759 | prompt string. 
760 | """ 761 | texts = [f"[INST] <>\n{system_prompt}\n<>\n\n"] 762 | for user_input, response in chat_history: 763 | texts.append(f"{user_input.strip()} [/INST] {response.strip()} [INST] ") 764 | texts.append(f"{message.strip()} [/INST]") 765 | return "".join(texts) 766 | 767 | 768 | class BackendType(Enum): 769 | UNKNOWN = 0 770 | TRANSFORMERS = 1 771 | GPTQ = 2 772 | LLAMA_CPP = 3 773 | 774 | @classmethod 775 | def get_type(cls, backend_name: str): 776 | backend_type = None 777 | backend_name_lower = backend_name.lower() 778 | if "transformers" in backend_name_lower: 779 | backend_type = BackendType.TRANSFORMERS 780 | elif "gptq" in backend_name_lower: 781 | backend_type = BackendType.GPTQ 782 | elif "cpp" in backend_name_lower: 783 | backend_type = BackendType.LLAMA_CPP 784 | else: 785 | raise Exception("Unknown backend: " + backend_name) 786 | # backend_type = BackendType.UNKNOWN 787 | return backend_type 788 | --------------------------------------------------------------------------------