├── docs ├── issues.md ├── news.md ├── performance.md └── pypi.md ├── tests ├── __init__.py └── test_get_prompt.py ├── llama2_wrapper ├── server │ ├── __init__.py │ ├── __main__.py │ └── app.py ├── download │ ├── __init__.py │ └── __main__.py ├── __init__.py ├── types.py └── model.py ├── static └── screenshot.png ├── .gitignore ├── requirements.txt ├── .env ├── env_examples ├── .env.13b_example ├── .env.7b_8bit_example ├── .env.7b_ggmlv3_q4_0_example └── .env.7b_gptq_example ├── .github └── workflows │ ├── release.yml │ └── branch.yml ├── LICENSE ├── pyproject.toml ├── prompts └── utils.py ├── CONTRIBUTING.md ├── colab ├── ggmlv3_q4_0.ipynb └── webui_CodeLlama_7B_Instruct_GPTQ.ipynb ├── benchmark.py ├── code_completion.py ├── app.py └── README.md /docs/issues.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama2_wrapper/server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama2_wrapper/download/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama2_wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LLAMA2_WRAPPER, get_prompt, get_prompt_for_dialog 2 | -------------------------------------------------------------------------------- /static/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liltom-eth/llama2-webui/HEAD/static/screenshot.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | models 2 | dist 3 | 4 | .DS_Store 5 | .vscode 6 | 7 | __pycache__ 8 | gradio_cached_examples 9 | 10 | .pytest_cache -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | auto-gptq==0.3.0 3 | bitsandbytes==0.40.2 4 | gradio==3.37.0 5 | protobuf==3.20.3 6 | scipy==1.11.1 7 | sentencepiece==0.1.99 8 | torch==2.0.1 9 | transformers==4.31.0 10 | tqdm==4.65.0 11 | python-dotenv==1.0.0 12 | llama-cpp-python==0.2.11 13 | memory-profiler==0.61.0 14 | huggingface-hub==0.16.4 15 | fastapi==0.100.0 16 | uvicorn==0.23.1 17 | sse-starlette==1.6.5 18 | pydantic==2.2.1 19 | pydantic-settings==2.0.3 20 | pytest==7.4.0 21 | black==23.7.0 22 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "" 2 | # if MODEL_PATH is "", default llama.cpp/gptq models 3 | # will be downloaded to: ./models 4 | 5 | # Example ggml path: 6 | # MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin" 7 | 8 | # options: llama.cpp, gptq, transformers 9 | BACKEND_TYPE = "llama.cpp" 10 | 11 | # only for transformers bitsandbytes 8 bit 12 | LOAD_IN_8BIT = False 13 | 14 | MAX_MAX_NEW_TOKENS = 2048 15 | DEFAULT_MAX_NEW_TOKENS 
= 1024 16 | MAX_INPUT_TOKEN_LENGTH = 4000 17 | 18 | DEFAULT_SYSTEM_PROMPT = "" 19 | -------------------------------------------------------------------------------- /env_examples/.env.13b_example: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "./models/Llama-2-13b-chat-hf" 2 | 3 | # options: llama.cpp, gptq, transformers 4 | BACKEND_TYPE = "transformers" 5 | 6 | # only for transformers bitsandbytes 8 bit 7 | LOAD_IN_8BIT = True 8 | 9 | MAX_MAX_NEW_TOKENS = 2048 10 | DEFAULT_MAX_NEW_TOKENS = 1024 11 | MAX_INPUT_TOKEN_LENGTH = 4000 12 | 13 | DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." 14 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | release: 4 | types: 5 | - created 6 | 7 | jobs: 8 | publish: 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: ['3.10'] 13 | poetry-version: ['1.5.1'] 14 | os: [ubuntu-latest] 15 | runs-on: ${{ matrix.os }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-python@v3 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Run image 22 | uses: abatilo/actions-poetry@v2.1.4 23 | with: 24 | poetry-version: ${{ matrix.poetry-version }} 25 | - name: Publish 26 | env: 27 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 28 | run: | 29 | poetry config pypi-token.pypi $PYPI_TOKEN 30 | poetry publish --build 31 | -------------------------------------------------------------------------------- /env_examples/.env.7b_8bit_example: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "./models/Llama-2-7b-chat-hf" 2 | 3 | # options: llama.cpp, gptq, transformers 4 | BACKEND_TYPE = "transformers" 5 | 6 | # only for transformers bitsandbytes 8 bit 7 | LOAD_IN_8BIT = True 8 | 9 | MAX_MAX_NEW_TOKENS = 2048 10 | DEFAULT_MAX_NEW_TOKENS = 1024 11 | MAX_INPUT_TOKEN_LENGTH = 4000 12 | 13 | DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." 
14 | -------------------------------------------------------------------------------- /env_examples/.env.7b_ggmlv3_q4_0_example: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "" 2 | # if MODEL_PATH is "", default llama.cpp/gptq models 3 | # will be downloaded to: ./models 4 | 5 | # Example ggml path: 6 | # MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin" 7 | 8 | # options: llama.cpp, gptq, transformers 9 | BACKEND_TYPE = "llama.cpp" 10 | 11 | # only for transformers bitsandbytes 8 bit 12 | LOAD_IN_8BIT = False 13 | 14 | MAX_MAX_NEW_TOKENS = 2048 15 | DEFAULT_MAX_NEW_TOKENS = 1024 16 | MAX_INPUT_TOKEN_LENGTH = 4000 17 | 18 | DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." 19 | -------------------------------------------------------------------------------- /env_examples/.env.7b_gptq_example: -------------------------------------------------------------------------------- 1 | MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ" 2 | # if MODEL_PATH is "", default llama.cpp/gptq models 3 | # will be downloaded to: ./models 4 | 5 | # Example gptq path: 6 | # MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ" 7 | 8 | # options: llama.cpp, gptq, transformers 9 | BACKEND_TYPE = "gptq" 10 | 11 | # only for transformers bitsandbytes 8 bit 12 | LOAD_IN_8BIT = False 13 | 14 | MAX_MAX_NEW_TOKENS = 2048 15 | DEFAULT_MAX_NEW_TOKENS = 1024 16 | MAX_INPUT_TOKEN_LENGTH = 4000 17 | 18 | DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Tom 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "llama2-wrapper" 3 | version = "0.1.14" 4 | description = "Use llama2-wrapper as your local llama2 backend for Generative Agents / Apps" 5 | authors = ["liltom-eth "] 6 | license = "MIT" 7 | homepage = "https://github.com/liltom-eth/llama2-webui" 8 | repository = "https://github.com/liltom-eth/llama2-webui" 9 | readme = "./docs/pypi.md" 10 | 11 | packages = [{include = "llama2_wrapper"}] 12 | 13 | [tool.poetry.dependencies] 14 | python = ">=3.10,<3.13" 15 | accelerate = "^0.21.0" 16 | auto-gptq = "0.3.0" 17 | gradio = "3.37.0" 18 | protobuf = "3.20.3" 19 | scipy = "1.11.1" 20 | sentencepiece = "0.1.99" 21 | torch = "2.0.1" 22 | transformers = "4.31.0" 23 | tqdm = "4.65.0" 24 | python-dotenv = "1.0.0" 25 | llama-cpp-python = "0.2.11" 26 | bitsandbytes = [ 27 | {platform = 'linux', version = "0.40.2"}, 28 | {platform = 'darwin', version = "0.40.2"}, 29 | ] 30 | memory-profiler = "0.61.0" 31 | huggingface-hub = "0.16.4" 32 | fastapi = "0.100.0" 33 | uvicorn = "0.23.1" 34 | sse-starlette = "1.6.5" 35 | pydantic = "2.2.1" 36 | pydantic-settings = "2.0.3" 37 | pytest = "7.4.0" 38 | black = "23.7.0" 39 | 40 | 41 | [build-system] 42 | requires = ["poetry-core"] 43 | build-backend = "poetry.core.masonry.api" 44 | 45 | [virtualenvs] 46 | create = true 47 | in-project = true -------------------------------------------------------------------------------- /llama2_wrapper/server/__main__.py: -------------------------------------------------------------------------------- 1 | """Example FastAPI server for llama2_wrapper. 2 | 3 | To run this example: 4 | 5 | ``` 6 | python3 -m llama2_wrapper.server 7 | ``` 8 | 9 | or 10 | 11 | ``` 12 | uvicorn llama2_wrapper.server.app:app --reload 13 | ``` 14 | 15 | Then visit http://localhost:8000/docs to see the interactive API docs. 
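The server exposes OpenAI-compatible routes such as /v1/completions and
/v1/chat/completions (see docs/pypi.md). As a rough, untested sketch that
assumes the OpenAI-style request schema, a completion request could look like:

```
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"prompt": "[INST] Hi do you know Pytorch? [/INST]", "max_tokens": 256}'
```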
16 | 17 | """ 18 | import os 19 | import argparse 20 | 21 | import uvicorn 22 | 23 | from llama2_wrapper.server.app import create_app, Settings 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser() 27 | for name, field in Settings.model_fields.items(): 28 | description = field.description 29 | if field.default is not None and description is not None: 30 | description += f" (default: {field.default})" 31 | parser.add_argument( 32 | f"--{name}", 33 | dest=name, 34 | type=field.annotation if field.annotation is not None else str, 35 | help=description, 36 | ) 37 | 38 | args = parser.parse_args() 39 | settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) 40 | app = create_app(settings=settings) 41 | 42 | uvicorn.run( 43 | app, 44 | host=os.getenv("HOST", settings.host), 45 | port=int(os.getenv("PORT", settings.port)), 46 | ) 47 | -------------------------------------------------------------------------------- /prompts/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from hashlib import md5 4 | 5 | 6 | def read_csv_to_dict_list(file_path): 7 | with open(file_path, mode="r", encoding="utf-8") as file: 8 | reader = csv.DictReader(file) 9 | list_of_dicts = [row for row in reader] 10 | return list_of_dicts 11 | 12 | 13 | def split_list_with_key(lst, dict_key): 14 | result = {} 15 | for row in lst: 16 | if row.get(dict_key) not in result: 17 | result[row.get(dict_key)] = [] 18 | result[row.get(dict_key)].append(row) 19 | return result 20 | 21 | 22 | def read_csv_to_type_dict(file_path, type_key): 23 | lst = read_csv_to_dict_list(file_path=file_path) 24 | return split_list_with_key(lst=lst, dict_key=type_key) 25 | 26 | 27 | def md5_str(str): 28 | return md5(str.encode("utf8")).hexdigest() 29 | 30 | 31 | current_dir = os.path.dirname(__file__) 32 | 33 | 34 | class PromtsContainer(object): 35 | def __init__(self) -> None: 36 | prompts_path = os.path.join(current_dir, "prompts_en.csv") 37 | self.data = read_csv_to_type_dict(prompts_path, "type") 38 | self.summary_dict = { 39 | md5_str(row.get("summary")): row.get("prompt") 40 | for chunk in self.data.values() 41 | for row in chunk 42 | } 43 | 44 | def get_prompts_tab_dict(self): 45 | return self.data 46 | 47 | def get_prompt_by_summary(self, summary): 48 | return self.summary_dict.get(md5_str(summary), summary) 49 | -------------------------------------------------------------------------------- /llama2_wrapper/download/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument( 8 | "--repo_id", 9 | type=str, 10 | default="", 11 | required=True, 12 | help="Repo ID like 'TheBloke/Llama-2-7B-Chat-GGML' ", 13 | ) 14 | parser.add_argument( 15 | "--filename", 16 | type=str, 17 | default=None, 18 | help="Filename like llama-2-7b-chat.ggmlv3.q4_0.bin", 19 | ) 20 | parser.add_argument( 21 | "--save_dir", type=str, default="./models", help="Directory to save models" 22 | ) 23 | 24 | args = parser.parse_args() 25 | 26 | repo_id = args.repo_id 27 | save_dir = args.save_dir 28 | 29 | if not os.path.exists(save_dir): 30 | os.makedirs(save_dir) 31 | 32 | if args.filename: 33 | filename = args.filename 34 | from huggingface_hub import hf_hub_download 35 | 36 | print(f"Start downloading model {repo_id} {filename} to: {save_dir}") 37 | 38 | hf_hub_download( 39 | repo_id=repo_id, 40 | 
filename=filename, 41 | local_dir=save_dir, 42 | ) 43 | else: 44 | repo_name = repo_id.split("/")[1] 45 | save_path = os.path.join(save_dir, repo_name) 46 | if not os.path.exists(save_path): 47 | os.makedirs(save_path) 48 | print(f"Start downloading model {repo_id} to: {save_path}") 49 | 50 | from huggingface_hub import snapshot_download 51 | 52 | snapshot_download( 53 | repo_id=repo_id, 54 | local_dir=save_path, 55 | ) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /.github/workflows/branch.yml: -------------------------------------------------------------------------------- 1 | name: Push 2 | on: [push] 3 | 4 | jobs: 5 | test: 6 | strategy: 7 | fail-fast: false 8 | matrix: 9 | python-version: ['3.10'] 10 | poetry-version: ['1.5.1'] 11 | os: [ubuntu-latest] 12 | runs-on: ${{ matrix.os }} 13 | steps: 14 | - uses: actions/checkout@v3 15 | - uses: actions/setup-python@v3 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Run image 19 | uses: abatilo/actions-poetry@v2.1.4 20 | with: 21 | poetry-version: ${{ matrix.poetry-version }} 22 | - name: Install dependencies 23 | run: poetry install 24 | - name: Run tests 25 | run: poetry run pytest 26 | - name: Upload coverage reports to Codecov 27 | uses: codecov/codecov-action@v3 28 | env: 29 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 30 | # - name: Upload coverage to Codecov 31 | # uses: codecov/codecov-action@v2 32 | code-quality: 33 | strategy: 34 | fail-fast: false 35 | matrix: 36 | python-version: ['3.10'] 37 | poetry-version: ['1.5.1'] 38 | os: [ubuntu-latest] 39 | runs-on: ${{ matrix.os }} 40 | steps: 41 | - uses: actions/checkout@v3 42 | - uses: actions/setup-python@v3 43 | with: 44 | python-version: ${{ matrix.python-version }} 45 | - name: Python Poetry Action 46 | uses: abatilo/actions-poetry@v2.1.6 47 | with: 48 | poetry-version: ${{ matrix.poetry-version }} 49 | - name: Install dependencies 50 | run: poetry install 51 | - name: Run black 52 | run: poetry run black . --check 53 | # - name: Run isort 54 | # run: poetry run isort . --check-only --profile black 55 | # - name: Run flake8 56 | # run: poetry run flake8 . 57 | # - name: Run bandit 58 | # run: poetry run bandit . 59 | # - name: Run saftey 60 | # run: poetry run safety check 61 | -------------------------------------------------------------------------------- /tests/test_get_prompt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from llama2_wrapper.model import get_prompt_for_dialog 3 | 4 | 5 | class TestClassGetPromptForDialog: 6 | from llama2_wrapper.types import Message 7 | 8 | dialog = [] 9 | message1 = Message( 10 | role="system", 11 | content="You are a helpful, respectful and honest assistant. ", 12 | ) 13 | message2 = Message( 14 | role="user", 15 | content="Hi do you know Pytorch?", 16 | ) 17 | dialog.append(message1) 18 | dialog.append(message2) 19 | 20 | dialog2 = [] 21 | dialog2.append(message1) 22 | dialog2.append(message2) 23 | message3 = Message( 24 | role="assistant", 25 | content="Yes I know Pytorch. 
", 26 | ) 27 | message4 = Message( 28 | role="user", 29 | content="Can you write a CNN in Pytorch?", 30 | ) 31 | dialog2.append(message3) 32 | dialog2.append(message4) 33 | 34 | dialog3 = [] 35 | dialog3.append(message3) 36 | dialog3.append(message4) 37 | dialog3.append(message3) 38 | dialog3.append(message4) 39 | message5 = Message( 40 | role="assistant", 41 | content="Yes I can write a CNN in Pytorch.", 42 | ) 43 | dialog3.append(message5) 44 | 45 | def test_dialog1(self): 46 | prompt = get_prompt_for_dialog(self.dialog) 47 | # print(prompt) 48 | result = """[INST] <>\nYou are a helpful, respectful and honest assistant. \n<>\n\nHi do you know Pytorch? [/INST]""" 49 | assert prompt == result 50 | 51 | def test_dialog2(self): 52 | prompt = get_prompt_for_dialog(self.dialog2) 53 | # print(prompt) 54 | result = """[INST] <>\nYou are a helpful, respectful and honest assistant. \n<>\n\nHi do you know Pytorch? [/INST] Yes I know Pytorch. [INST] Can you write a CNN in Pytorch? [/INST]""" 55 | assert prompt == result 56 | 57 | def test_dialog3(self): 58 | with pytest.raises(AssertionError): 59 | prompt = get_prompt_for_dialog(self.dialog3) 60 | -------------------------------------------------------------------------------- /docs/news.md: -------------------------------------------------------------------------------- 1 | # News 2 | - [2023/09] The newest `llama2-wrapper>=0.1.14` supports llama.cpp's `gguf` models. 3 | 4 | - [2023/08] 🔥 For developers, we offer a web server that acts as a drop-in replacement for the OpenAI API. 5 | 6 | - Usage: 7 | 8 | ``` 9 | python3 -m llama2_wrapper.server 10 | ``` 11 | 12 | 13 | 14 | - [2023/08] 🔥 For developers, we released `llama2-wrapper` as a llama2 backend wrapper in [PYPI](https://pypi.org/project/llama2-wrapper/). 15 | 16 | - Install: `pip install llama2-wrapper` 17 | 18 | - Usage: 19 | 20 | ```python 21 | from llama2_wrapper import LLAMA2_WRAPPER, get_prompt 22 | llama2_wrapper = LLAMA2_WRAPPER( 23 | model_path="./models/Llama-2-7B-Chat-GGML/llama-2-7b-chat.ggmlv3.q4_0.bin", 24 | backend_type="llama.cpp", #options: llama.cpp, transformers, gptq 25 | ) 26 | prompt = "Do you know Pytorch" 27 | llama2_promt = get_prompt(prompt) 28 | answer = llama2_wrapper(llama2_promt, temperature=0.9) 29 | ``` 30 | 31 | - [2023/08] 🔥 We added `benchmark.py` for users to benchmark llama2 models on their local devices. 32 | 33 | - Check/contribute the performance of your device in the full [performance doc](https://github.com/liltom-eth/llama2-webui/blob/main/docs/performance.md). 34 | 35 | - [2023/07] We released **[llama2-webui](https://github.com/liltom-eth/llama2-webui)**, a gradio web UI to run Llama 2 on GPU or CPU from anywhere (Linux/Windows/Mac). 36 | 37 | - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), all [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), all [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) ... 
38 | - Supporting model backends: [tranformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp) -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | # Benchmark Performance 2 | 3 | ## Performance on Nvidia GPU 4 | 5 | | Model | Precision | Device | GPU VRAM | Speed (tokens/sec) | load time (s) | 6 | | --------------------------------- | --------- | ---------- | ---------------------- | ---------------- | ---------------- | 7 | | Llama-2-7b-chat-hf | 16 bit | | | | | 8 | | Llama-2-7b-chat-hf | 8bit | NVIDIA RTX 2080 Ti | 7.7 GB VRAM | 3.76 | 641.36 | 9 | | Llama-2-7b-Chat-GPTQ | 4bit | NVIDIA RTX 2080 Ti | 5.8 GB VRAM | 18.85 | 192.91 | 10 | | Llama-2-7b-Chat-GPTQ | 4bit | NVIDIA GTX 1660 Super | 4.8 GB VRAM | 8.5 | 262.74 | 11 | | Llama-2-7b-Chat-GPTQ | 4 bit | Google Colab T4 | 5.8 GB VRAM | 18.19 | 37.44 | 12 | | Llama-2-13b-chat-hf | 16 bit | | | | | 13 | | | | | | | | 14 | 15 | ## Performance on CPU / OpenBLAS / cuBLAS / CLBlast / Metal 16 | 17 | | Model | Precision | Device | RAM / GPU VRAM | Speed (tokens/sec) | load time (s) | 18 | | --------------------------------- | --------- | ---------- | ---------------------- | ---------------- | ---------------- | 19 | | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Intel i7-8700 | 4.5 GB RAM | 7.88 | 31.90 | 20 | | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Apple M2 CPU | 4.5 GB RAM | 11.10 | 0.10 | 21 | | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Apple M2 Metal | 4.5 GB RAM | 12.10 | 0.12 | 22 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel i7-8700 | 5.4 GB RAM | 6.27 | 173.15 | 23 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel i7-9700 | 4.8 GB RAM | 4.2 | 87.9 | 24 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M1 Pro CPU | 5.4 GB RAM | 17.90 | 0.18 | 25 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 CPU | 5.4 GB RAM | 13.70 | 0.13 | 26 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 Metal | 5.4 GB RAM | 12.60 | 0.10 | 27 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | AMD Ryzen 9 5900HS | 4.1 GB RAM | 6.01 | 0.15 | 28 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel vServer 4 threads, eth services | 8 GB RAM | 1.31 | 0.5| 29 | | llama-2-7b-chat.ggmlv3.q8_0 | 8 bit | Intel i7-8700 | 8.6 GB RAM | 2.63 | 336.57 | 30 | | llama-2-7b-chat.ggmlv3.q8_0 | 8 bit | Intel i7-9700 | 7.6 GB RAM | 2.05 | 302.9 | 31 | | | | | | | | 32 | 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to [llama2-webui](https://github.com/liltom-eth/llama2-webui) 2 | 3 | We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's: 4 | 5 | - Reporting a bug 6 | - Proposing new features 7 | - Discussing the current state of the code 8 | - Update README.md 9 | - Submitting a PR 10 | 11 | ## Using GitHub's [issues](https://github.com/liltom-eth/llama2-webui/issues) 12 | 13 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/liltom-eth/llama2-webui/issues). It's that easy! 
14 | 15 | Thanks for **[jlb1504](https://github.com/jlb1504)** for reporting the [first issue](https://github.com/liltom-eth/llama2-webui/issues/1)! 16 | 17 | **Great Bug Reports** tend to have: 18 | 19 | - A quick summary and/or background 20 | - Steps to reproduce 21 | - Be specific! 22 | - Give a sample code if you can. 23 | - What you expected would happen 24 | - What actually happens 25 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) 26 | 27 | Proposing new features are also welcome. 28 | 29 | ## Pull Request 30 | 31 | All pull requests are welcome. For example, you update the `README.md` to help users to better understand the usage. 32 | 33 | ### Clone the repository 34 | 35 | 1. Create a user account on GitHub if you do not already have one. 36 | 37 | 2. Fork the project [repository](https://github.com/liltom-eth/llama2-webui): click on the *Fork* button near the top of the page. This creates a copy of the code under your account on GitHub. 38 | 39 | 3. Clone this copy to your local disk: 40 | 41 | ``` 42 | git clone git@github.com:liltom-eth/llama2-webui.git 43 | cd llama2-webui 44 | ``` 45 | 46 | ### Implement your changes 47 | 48 | 1. Create a branch to hold your changes: 49 | 50 | ``` 51 | git checkout -b my-feature 52 | ``` 53 | 54 | and start making changes. Never work on the main branch! 55 | 56 | 2. Start your work on this branch. 57 | 58 | 3. When you’re done editing, do: 59 | 60 | ``` 61 | git add 62 | git commit 63 | ``` 64 | 65 | to record your changes in [git](https://git-scm.com/). 66 | 67 | ### Submit your contribution 68 | 69 | 1. If everything works fine, push your local branch to the remote server with: 70 | 71 | ``` 72 | git push -u origin my-feature 73 | ``` 74 | 75 | 2. Go to the web page of your fork and click "Create pull request" to send your changes for review. 76 | 77 | ```{todo} 78 | Find more detailed information in [creating a PR]. You might also want to open 79 | the PR as a draft first and mark it as ready for review after the feedbacks 80 | from the continuous integration (CI) system or any required fixes. 81 | ``` 82 | 83 | ## License 84 | 85 | By contributing, you agree that your contributions will be licensed under its MIT License. 86 | 87 | ## Questions? 
88 | 89 | Email us at [liltom.eth@gmail.com](mailto:liltom.eth@gmail.com) 90 | 91 | -------------------------------------------------------------------------------- /llama2_wrapper/types.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional, Dict, Union 2 | from typing_extensions import TypedDict, NotRequired, Literal 3 | 4 | B_INST, E_INST = "[INST]", "[/INST]" 5 | B_SYS, E_SYS = "<>\n", "\n<>\n\n" 6 | 7 | 8 | # Role = Literal["system", "user", "assistant"] 9 | # class Message(TypedDict): 10 | # role: Role 11 | # content: str 12 | 13 | 14 | class ChatCompletionMessage(TypedDict): 15 | role: Literal["assistant", "user", "system"] 16 | content: str 17 | user: NotRequired[str] 18 | 19 | 20 | # transformers: Message; llama.cpp: ChatCompletionMessage 21 | Message = ChatCompletionMessage 22 | Dialog = List[Message] 23 | 24 | 25 | class EmbeddingUsage(TypedDict): 26 | prompt_tokens: int 27 | total_tokens: int 28 | 29 | 30 | class EmbeddingData(TypedDict): 31 | index: int 32 | object: str 33 | embedding: List[float] 34 | 35 | 36 | class Embedding(TypedDict): 37 | object: Literal["list"] 38 | model: str 39 | data: List[EmbeddingData] 40 | usage: EmbeddingUsage 41 | 42 | 43 | class CompletionLogprobs(TypedDict): 44 | text_offset: List[int] 45 | token_logprobs: List[Optional[float]] 46 | tokens: List[str] 47 | top_logprobs: List[Optional[Dict[str, float]]] 48 | 49 | 50 | class CompletionChoice(TypedDict): 51 | text: str 52 | index: int 53 | logprobs: Optional[CompletionLogprobs] 54 | finish_reason: Optional[str] 55 | 56 | 57 | class CompletionUsage(TypedDict): 58 | prompt_tokens: int 59 | completion_tokens: int 60 | total_tokens: int 61 | 62 | 63 | class CompletionChunk(TypedDict): 64 | id: str 65 | object: Literal["text_completion"] 66 | created: int 67 | model: str 68 | choices: List[CompletionChoice] 69 | 70 | 71 | class Completion(TypedDict): 72 | id: str 73 | object: Literal["text_completion"] 74 | created: int 75 | model: str 76 | choices: List[CompletionChoice] 77 | usage: CompletionUsage 78 | 79 | 80 | class ChatCompletionChoice(TypedDict): 81 | index: int 82 | message: ChatCompletionMessage 83 | finish_reason: Optional[str] 84 | 85 | 86 | class ChatCompletion(TypedDict): 87 | id: str 88 | object: Literal["chat.completion"] 89 | created: int 90 | model: str 91 | choices: List[ChatCompletionChoice] 92 | usage: CompletionUsage 93 | 94 | 95 | class ChatCompletionChunkDeltaEmpty(TypedDict): 96 | pass 97 | 98 | 99 | class ChatCompletionChunkDelta(TypedDict): 100 | role: NotRequired[Literal["assistant"]] 101 | content: NotRequired[str] 102 | 103 | 104 | class ChatCompletionChunkChoice(TypedDict): 105 | index: int 106 | delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty] 107 | finish_reason: Optional[str] 108 | 109 | 110 | class ChatCompletionChunk(TypedDict): 111 | id: str 112 | model: str 113 | object: Literal["chat.completion.chunk"] 114 | created: int 115 | choices: List[ChatCompletionChunkChoice] 116 | -------------------------------------------------------------------------------- /colab/ggmlv3_q4_0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "toc_visible": true, 8 | "authorship_tag": "ABX9TyM9WbudQYrVFksXUrt4Opt3", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | 
"language_info": { 16 | "name": "python" 17 | } 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "id": "7O5JSosg5-rx" 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "%cd /content\n", 39 | "!pip install llama2-wrapper\n" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "source": [ 45 | "from llama2_wrapper import LLAMA2_WRAPPER, get_prompt\n", 46 | "\n", 47 | "llama2_wrapper = LLAMA2_WRAPPER()" 48 | ], 49 | "metadata": { 50 | "colab": { 51 | "base_uri": "https://localhost:8080/" 52 | }, 53 | "id": "8rgb1ckl72wC", 54 | "outputId": "d9ca2e20-26a5-490b-86f2-1a182e533b20" 55 | }, 56 | "execution_count": 5, 57 | "outputs": [ 58 | { 59 | "output_type": "stream", 60 | "name": "stdout", 61 | "text": [ 62 | "Running on backend llama.cpp.\n", 63 | "Use default model path: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin\n", 64 | "Start downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin\n" 65 | ] 66 | } 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "prompt = get_prompt(\"Hi do you know Pytorch?\")\n", 73 | "print(llama2_wrapper(prompt))" 74 | ], 75 | "metadata": { 76 | "id": "Qz2xAqozTIf6", 77 | "colab": { 78 | "base_uri": "https://localhost:8080/" 79 | }, 80 | "outputId": "1380fa52-3d4a-4ac5-ed02-7faefe7ec2f6" 81 | }, 82 | "execution_count": 3, 83 | "outputs": [ 84 | { 85 | "output_type": "stream", 86 | "name": "stdout", 87 | "text": [ 88 | " Yes, I'm familiar with PyTorch! PyTorch is an open-source deep learning framework that is widely used for building and training neural networks. It was originally developed by Facebook and is now maintained by the PyTorch Foundation.\n", 89 | "\n", 90 | "Here are some key features and capabilities of PyTorch:\n", 91 | "\n", 92 | "1. **Tensor Computation**: PyTorch provides a powerful tensor computation engine that allows for complex mathematical operations on large datasets.\n", 93 | "2. **Autograd**: PyTorch's autograd system automatically computes gradients, which can save a lot of time and effort during training.\n", 94 | "3. **Dynamic Compute**: PyTorch's dynamic compute system allows for more efficient computation by only computing the necessary computations at runtime.\n", 95 | "4. **Memory-efficient**: PyTorch is designed to be memory-efficient, which is important for training large models that require a lot of memory.\n", 96 | "5. **Accelerators**: PyTorch supports a wide range of accelerators, including GPUs, TPUs, and FPGAs, which can significantly speed up training times.\n", 97 | "6. **Modules**: PyTorch provides a wide range of pre-built modules for common tasks, such as convolutional layers, recurrent neural networks, and more.\n", 98 | "7. **Extensive Community**: PyTorch has a large and active community of developers and users, which can be helpful for getting support and staying up-to-date with the latest developments.\n", 99 | "8. **Easy Integration**: PyTorch can be easily integrated with other popular deep learning frameworks, such as TensorFlow and Keras.\n", 100 | "9. **Pythonic**: PyTorch is written in Python, which is a popular and easy-to-learn programming language.\n", 101 | "10. 
**Flexible**: PyTorch allows for a wide range of customization options, which can be useful for building and training unique models.\n", 102 | "\n", 103 | "Overall, PyTorch is a powerful and flexible deep learning framework that can be used for a wide range of applications, including computer vision, natural language processing, and more.\n" 104 | ] 105 | } 106 | ] 107 | } 108 | ] 109 | } -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import argparse 4 | 5 | from dotenv import load_dotenv 6 | from distutils.util import strtobool 7 | from memory_profiler import memory_usage 8 | from tqdm import tqdm 9 | 10 | from llama2_wrapper import LLAMA2_WRAPPER 11 | 12 | 13 | def run_iteration( 14 | llama2_wrapper, prompt_example, DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS 15 | ): 16 | def generation(): 17 | generator = llama2_wrapper.run( 18 | prompt_example, 19 | [], 20 | DEFAULT_SYSTEM_PROMPT, 21 | DEFAULT_MAX_NEW_TOKENS, 22 | 1, 23 | 0.95, 24 | 50, 25 | ) 26 | model_response = None 27 | try: 28 | first_model_response = next(generator) 29 | except StopIteration: 30 | pass 31 | for model_response in generator: 32 | pass 33 | return llama2_wrapper.get_token_length(model_response), model_response 34 | 35 | tic = time.perf_counter() 36 | mem_usage, (output_token_length, model_response) = memory_usage( 37 | (generation,), max_usage=True, retval=True 38 | ) 39 | toc = time.perf_counter() 40 | 41 | generation_time = toc - tic 42 | tokens_per_second = output_token_length / generation_time 43 | 44 | return generation_time, tokens_per_second, mem_usage, model_response 45 | 46 | 47 | def main(): 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("--iter", type=int, default=5, help="Number of iterations") 50 | parser.add_argument("--model_path", type=str, default="", help="model path") 51 | parser.add_argument( 52 | "--backend_type", 53 | type=str, 54 | default="", 55 | help="Backend options: llama.cpp, gptq, transformers", 56 | ) 57 | parser.add_argument( 58 | "--load_in_8bit", 59 | type=bool, 60 | default=False, 61 | help="Whether to use bitsandbytes 8 bit.", 62 | ) 63 | 64 | args = parser.parse_args() 65 | 66 | load_dotenv() 67 | 68 | DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "") 69 | MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048)) 70 | DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", 1024)) 71 | MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", 4000)) 72 | 73 | MODEL_PATH = os.getenv("MODEL_PATH") 74 | assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}" 75 | BACKEND_TYPE = os.getenv("BACKEND_TYPE") 76 | assert BACKEND_TYPE is not None, f"BACKEND_TYPE is required, got: {BACKEND_TYPE}" 77 | 78 | LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True"))) 79 | 80 | if args.model_path != "": 81 | MODEL_PATH = args.model_path 82 | if args.backend_type != "": 83 | BACKEND_TYPE = args.backend_type 84 | if args.load_in_8bit: 85 | LOAD_IN_8BIT = True 86 | 87 | # Initialization 88 | init_tic = time.perf_counter() 89 | llama2_wrapper = LLAMA2_WRAPPER( 90 | model_path=MODEL_PATH, 91 | backend_type=BACKEND_TYPE, 92 | max_tokens=MAX_INPUT_TOKEN_LENGTH, 93 | load_in_8bit=LOAD_IN_8BIT, 94 | # verbose=True, 95 | ) 96 | 97 | init_toc = time.perf_counter() 98 | initialization_time = init_toc - init_tic 99 | 100 | total_time = 0 101 | 
total_tokens_per_second = 0 102 | total_memory_gen = 0 103 | 104 | prompt_example = ( 105 | "Can you explain briefly to me what is the Python programming language?" 106 | ) 107 | 108 | # Cold run 109 | print("Performing cold run...") 110 | run_iteration( 111 | llama2_wrapper, prompt_example, DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS 112 | ) 113 | 114 | # Timed runs 115 | print(f"Performing {args.iter} timed runs...") 116 | for i in tqdm(range(args.iter)): 117 | try: 118 | gen_time, tokens_per_sec, mem_gen, model_response = run_iteration( 119 | llama2_wrapper, 120 | prompt_example, 121 | DEFAULT_SYSTEM_PROMPT, 122 | DEFAULT_MAX_NEW_TOKENS, 123 | ) 124 | total_time += gen_time 125 | total_tokens_per_second += tokens_per_sec 126 | total_memory_gen += mem_gen 127 | except: 128 | break 129 | avg_time = total_time / (i + 1) 130 | avg_tokens_per_second = total_tokens_per_second / (i + 1) 131 | avg_memory_gen = total_memory_gen / (i + 1) 132 | 133 | print(f"Last model response: {model_response}") 134 | print(f"Initialization time: {initialization_time:0.4f} seconds.") 135 | print( 136 | f"Average generation time over {(i + 1)} iterations: {avg_time:0.4f} seconds." 137 | ) 138 | print( 139 | f"Average speed over {(i + 1)} iterations: {avg_tokens_per_second:0.4f} tokens/sec." 140 | ) 141 | print(f"Average memory usage during generation: {avg_memory_gen:.2f} MiB") 142 | 143 | 144 | if __name__ == "__main__": 145 | main() 146 | -------------------------------------------------------------------------------- /docs/pypi.md: -------------------------------------------------------------------------------- 1 | # llama2-wrapper 2 | 3 | - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb). 4 | 5 | - [Run OpenAI Compatible API](https://github.com/liltom-eth/llama2-webui#start-openai-compatible-api) on Llama2 models. 6 | 7 | ## Features 8 | 9 | - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)... 10 | - Supporting model backends: [tranformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp) 11 | - Demos: [Run Llama2 on MacBook Air](https://twitter.com/liltom_eth/status/1682791729207070720?s=20); [Run Llama2 on Colab T4 GPU](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb) 12 | - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb). 13 | - [Run OpenAI Compatible API](https://github.com/liltom-eth/llama2-webui#start-openai-compatible-api) on Llama2 models. 
14 | - [News](https://github.com/liltom-eth/llama2-webui/blob/main/docs/news.md), [Benchmark](https://github.com/liltom-eth/llama2-webui/blob/main/docs/performance.md), [Issue Solutions](https://github.com/liltom-eth/llama2-webui/blob/main/docs/issues.md) 15 | 16 | [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) is the backend and part of [llama2-webui](https://github.com/liltom-eth/llama2-webui), which can run any Llama 2 locally with gradio UI on GPU or CPU from anywhere (Linux/Windows/Mac). 17 | 18 | ## Install 19 | 20 | ```bash 21 | pip install llama2-wrapper 22 | ``` 23 | 24 | ## Start OpenAI Compatible API 25 | 26 | ``` 27 | python -m llama2_wrapper.server 28 | ``` 29 | 30 | it will use `llama.cpp` as the backend by default to run `llama-2-7b-chat.ggmlv3.q4_0.bin` model. 31 | 32 | Start Fast API for `gptq` backend: 33 | 34 | ``` 35 | python -m llama2_wrapper.server --backend_type gptq 36 | ``` 37 | 38 | Navigate to http://localhost:8000/docs to see the OpenAPI documentation. 39 | 40 | ## API Usage 41 | 42 | ### `__call__` 43 | 44 | `__call__()` is the function to generate text from a prompt. 45 | 46 | For example, run ggml llama2 model on CPU, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/ggmlv3_q4_0.ipynb): 47 | 48 | ```python 49 | from llama2_wrapper import LLAMA2_WRAPPER, get_prompt 50 | llama2_wrapper = LLAMA2_WRAPPER() 51 | # Default running on backend llama.cpp. 52 | # Automatically downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin 53 | prompt = "Do you know Pytorch" 54 | # llama2_wrapper() will run __call__() 55 | answer = llama2_wrapper(get_prompt(prompt), temperature=0.9) 56 | ``` 57 | 58 | Run gptq llama2 model on Nvidia GPU, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb): 59 | 60 | ```python 61 | from llama2_wrapper import LLAMA2_WRAPPER 62 | llama2_wrapper = LLAMA2_WRAPPER(backend_type="gptq") 63 | # Automatically downloading model to: ./models/Llama-2-7b-Chat-GPTQ 64 | ``` 65 | 66 | Run llama2 7b with bitsandbytes 8 bit with a `model_path`: 67 | 68 | ```python 69 | from llama2_wrapper import LLAMA2_WRAPPER 70 | llama2_wrapper = LLAMA2_WRAPPER( 71 | model_path = "./models/Llama-2-7b-chat-hf", 72 | backend_type = "transformers", 73 | load_in_8bit = True 74 | ) 75 | ``` 76 | 77 | ### completion 78 | 79 | `completion()` is the function to generate text from a prompt for OpenAI compatible API `/v1/completions`. 80 | 81 | ```python 82 | llama2_wrapper = LLAMA2_WRAPPER() 83 | prompt = get_prompt("Hi do you know Pytorch?") 84 | print(llm.completion(prompt)) 85 | ``` 86 | 87 | ### chat_completion 88 | 89 | `chat_completion()` is the function to generate text from a dialog (chat history) for OpenAI compatible API `/v1/chat/completions`. 90 | 91 | ```python 92 | llama2_wrapper = LLAMA2_WRAPPER() 93 | dialog = [ 94 | { 95 | "role":"system", 96 | "content":"You are a helpful, respectful and honest assistant. " 97 | },{ 98 | "role":"user", 99 | "content":"Hi do you know Pytorch?", 100 | }, 101 | ] 102 | print(llm.chat_completion(dialog)) 103 | ``` 104 | 105 | ### generate 106 | 107 | `generate()` is the function to create a generator of response from a prompt. 108 | 109 | This is useful when you want to stream the output like typing in the chatbot. 
110 | 111 | ```python 112 | llama2_wrapper = LLAMA2_WRAPPER() 113 | prompt = get_prompt("Hi do you know Pytorch?") 114 | for response in llama2_wrapper.generate(prompt): 115 | print(response) 116 | 117 | ``` 118 | 119 | The response will be like: 120 | 121 | ``` 122 | Yes, 123 | Yes, I'm 124 | Yes, I'm familiar 125 | Yes, I'm familiar with 126 | Yes, I'm familiar with PyTorch! 127 | ... 128 | ``` 129 | 130 | ### run 131 | 132 | `run()` is similar to `generate()`, but `run()`can also accept `chat_history`and `system_prompt` from the users. 133 | 134 | It will process the input message to llama2 prompt template with `chat_history` and `system_prompt` for a chatbot-like app. 135 | 136 | ### get_prompt 137 | 138 | `get_prompt()` will process the input message to llama2 prompt with `chat_history` and `system_prompt`for chatbot. 139 | 140 | By default, `chat_history` and `system_prompt` are empty and `get_prompt()` will add llama2 prompt template to your message: 141 | 142 | ```python 143 | prompt = get_prompt("Hi do you know Pytorch?") 144 | ``` 145 | 146 | prompt will be: 147 | 148 | ``` 149 | [INST] <> 150 | 151 | <> 152 | 153 | Hi do you know Pytorch? [/INST] 154 | ``` 155 | 156 | If use `get_prompt("Hi do you know Pytorch?", system_prompt="You are a helpful...")`: 157 | 158 | ``` 159 | [INST] <> 160 | You are a helpful, respectful and honest assistant. 161 | <> 162 | 163 | Hi do you know Pytorch? [/INST] 164 | ``` 165 | 166 | ### get_prompt_for_dialog 167 | 168 | `get_prompt_for_dialog()` will process dialog (chat history) to llama2 prompt for OpenAI compatible API `/v1/chat/completions`. 169 | 170 | ```python 171 | dialog = [ 172 | { 173 | "role":"system", 174 | "content":"You are a helpful, respectful and honest assistant. " 175 | },{ 176 | "role":"user", 177 | "content":"Hi do you know Pytorch?", 178 | }, 179 | ] 180 | prompt = get_prompt_for_dialog("Hi do you know Pytorch?") 181 | # [INST] <> 182 | # You are a helpful, respectful and honest assistant. 183 | # <> 184 | # 185 | # Hi do you know Pytorch? [/INST] 186 | ``` 187 | 188 | -------------------------------------------------------------------------------- /code_completion.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gradio as gr 4 | from llama2_wrapper import LLAMA2_WRAPPER 5 | 6 | FIM_PREFIX = "
 "
  7 | FIM_MIDDLE = " <MID>"
  8 | FIM_SUFFIX = " <SUF>"
  9 | 
 10 | FIM_INDICATOR = "<FILL_ME>"
 11 | 
 12 | EOS_STRING = "</s>"
 13 | EOT_STRING = "<EOT>"
 14 | 
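# Note: <PRE> / <SUF> / <MID> above are Code Llama's fill-in-the-middle sentinel
# tokens, <FILL_ME> marks the spot in a prompt to be filled in, </s> is the
# end-of-sequence token, and <EOT> marks the end of an infill generation.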
 15 | 
 16 | def main():
 17 |     parser = argparse.ArgumentParser()
 18 |     parser.add_argument(
 19 |         "--model_path",
 20 |         type=str,
 21 |         default="./models/codellama-7b-instruct.ggmlv3.Q4_0.bin",
 22 |         help="model path",
 23 |     )
 24 |     parser.add_argument(
 25 |         "--backend_type",
 26 |         type=str,
 27 |         default="llama.cpp",
 28 |         help="Backend options: llama.cpp, gptq, transformers",
 29 |     )
 30 |     parser.add_argument(
 31 |         "--max_tokens",
 32 |         type=int,
 33 |         default=4000,
 34 |         help="Maximum context size.",
 35 |     )
 36 |     parser.add_argument(
 37 |         "--load_in_8bit",
 38 |         type=bool,
 39 |         default=False,
 40 |         help="Whether to use bitsandbytes 8 bit.",
 41 |     )
 42 |     parser.add_argument(
 43 |         "--share",
 44 |         type=bool,
 45 |         default=False,
 46 |         help="Whether to share public for gradio.",
 47 |     )
 48 |     args = parser.parse_args()
 49 | 
 50 |     llama2_wrapper = LLAMA2_WRAPPER(
 51 |         model_path=args.model_path,
 52 |         backend_type=args.backend_type,
 53 |         max_tokens=args.max_tokens,
 54 |         load_in_8bit=args.load_in_8bit,
 55 |     )
 56 | 
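    # generate() streams text for `prompt`. If the prompt contains FIM_INDICATOR
    # (<FILL_ME>), it is split into prefix/suffix and rewritten into Code Llama's
    # infill format so the model generates the missing middle section.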
 57 |     def generate(
 58 |         prompt,
 59 |         temperature=0.9,
 60 |         max_new_tokens=256,
 61 |         top_p=0.95,
 62 |         repetition_penalty=1.0,
 63 |     ):
 64 |         temperature = float(temperature)
 65 |         if temperature < 1e-2:
 66 |             temperature = 1e-2
 67 |         top_p = float(top_p)
 68 |         fim_mode = False
 69 | 
 70 |         generate_kwargs = dict(
 71 |             temperature=temperature,
 72 |             max_new_tokens=max_new_tokens,
 73 |             top_p=top_p,
 74 |             repetition_penalty=repetition_penalty,
 75 |             stream=True,
 76 |         )
 77 | 
 78 |         if FIM_INDICATOR in prompt:
 79 |             fim_mode = True
 80 |             try:
 81 |                 prefix, suffix = prompt.split(FIM_INDICATOR)
 82 |             except ValueError:
 83 |                 raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt!")
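            # Code Llama infill ("PSM") prompt layout: <PRE> {prefix} <SUF> {suffix} <MID>;
            # the model then generates the missing middle between prefix and suffix.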
 84 |             prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
 85 | 
 86 |         stream = llama2_wrapper.__call__(prompt, **generate_kwargs)
 87 | 
 88 |         if fim_mode:
 89 |             output = prefix
 90 |         else:
 91 |             output = prompt
 92 | 
 93 |         # for response in stream:
 94 |         #     output += response
 95 |         #     yield output
 96 |         # return output
 97 | 
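        # Stream chunks until an end marker (</s> or <EOT>) appears. In
        # fill-in-the-middle mode, the original suffix is appended back onto the
        # generated prefix + middle before returning the final output.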
 98 |         previous_token = ""
 99 |         for response in stream:
100 |             if any([end_token in response for end_token in [EOS_STRING, EOT_STRING]]):
101 |                 if fim_mode:
102 |                     output += suffix
103 |                     yield output
104 |                     return output
105 |                     print("output", output)
106 |                 else:
107 |                     return output
108 |             else:
109 |                 output += response
110 |             previous_token = response
111 |             yield output
112 |         return output
113 | 
114 |     examples = [
115 |         'def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\nprint(remove_non_ascii(\'afkdj$$(\'))',
116 |         "X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.1)\n\n# Train a logistic regression model, predict the labels on the test set and compute the accuracy score",
117 |         "// Returns every other value in the array as a new array.\nfunction everyOther(arr) {",
118 |         "Poor English: She no went to the market. Corrected English:",
119 |         "def alternating(list1, list2):\n   results = []\n   for i in range(min(len(list1), len(list2))):\n       results.append(list1[i])\n       results.append(list2[i])\n   if len(list1) > len(list2):\n       <FILL_ME>\n   else:\n       results.extend(list2[i+1:])\n   return results",
120 |     ]
121 | 
122 |     def process_example(args):
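        # Drain the streaming generator and return only the final, fully generated
        # output (used by gr.Examples below to pre-compute example completions).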
123 |         for x in generate(args):
124 |             pass
125 |         return x
126 | 
127 |     description = """
128 |     <div style="text-align: center;">
129 |         <h1>Code Llama Playground</h1>
130 |     </div>
131 |     <div style="text-align: center;">
132 |         <p>This is a demo to complete code with Code Llama. For instruction purposes, please use llama2-webui app.py with CodeLlama-Instruct models.</p>
133 |     </div>
134 | 
135 | """ 136 | with gr.Blocks() as demo: 137 | with gr.Column(): 138 | gr.Markdown(description) 139 | with gr.Row(): 140 | with gr.Column(): 141 | instruction = gr.Textbox( 142 | placeholder="Enter your code here", 143 | lines=5, 144 | label="Input", 145 | elem_id="q-input", 146 | ) 147 | submit = gr.Button("Generate", variant="primary") 148 | output = gr.Code(elem_id="q-output", lines=30, label="Output") 149 | with gr.Row(): 150 | with gr.Column(): 151 | with gr.Accordion("Advanced settings", open=False): 152 | with gr.Row(): 153 | column_1, column_2 = gr.Column(), gr.Column() 154 | with column_1: 155 | temperature = gr.Slider( 156 | label="Temperature", 157 | value=0.1, 158 | minimum=0.0, 159 | maximum=1.0, 160 | step=0.05, 161 | interactive=True, 162 | info="Higher values produce more diverse outputs", 163 | ) 164 | max_new_tokens = gr.Slider( 165 | label="Max new tokens", 166 | value=256, 167 | minimum=0, 168 | maximum=8192, 169 | step=64, 170 | interactive=True, 171 | info="The maximum numbers of new tokens", 172 | ) 173 | with column_2: 174 | top_p = gr.Slider( 175 | label="Top-p (nucleus sampling)", 176 | value=0.90, 177 | minimum=0.0, 178 | maximum=1, 179 | step=0.05, 180 | interactive=True, 181 | info="Higher values sample more low-probability tokens", 182 | ) 183 | repetition_penalty = gr.Slider( 184 | label="Repetition penalty", 185 | value=1.05, 186 | minimum=1.0, 187 | maximum=2.0, 188 | step=0.05, 189 | interactive=True, 190 | info="Penalize repeated tokens", 191 | ) 192 | 193 | gr.Examples( 194 | examples=examples, 195 | inputs=[instruction], 196 | cache_examples=False, 197 | fn=process_example, 198 | outputs=[output], 199 | ) 200 | 201 | submit.click( 202 | generate, 203 | inputs=[ 204 | instruction, 205 | temperature, 206 | max_new_tokens, 207 | top_p, 208 | repetition_penalty, 209 | ], 210 | outputs=[output], 211 | ) 212 | demo.queue(concurrency_count=16).launch(share=args.share) 213 | 214 | 215 | if __name__ == "__main__": 216 | main() 217 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from typing import Iterator 4 | 5 | import gradio as gr 6 | from dotenv import load_dotenv 7 | from distutils.util import strtobool 8 | 9 | from llama2_wrapper import LLAMA2_WRAPPER 10 | 11 | import logging 12 | 13 | from prompts.utils import PromtsContainer 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--model_path", type=str, default="", help="model path") 19 | parser.add_argument( 20 | "--backend_type", 21 | type=str, 22 | default="", 23 | help="Backend options: llama.cpp, gptq, transformers", 24 | ) 25 | parser.add_argument( 26 | "--load_in_8bit", 27 | type=bool, 28 | default=False, 29 | help="Whether to use bitsandbytes 8 bit.", 30 | ) 31 | parser.add_argument( 32 | "--share", 33 | type=bool, 34 | default=False, 35 | help="Whether to share public for gradio.", 36 | ) 37 | args = parser.parse_args() 38 | 39 | load_dotenv() 40 | 41 | DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "") 42 | MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048)) 43 | DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", 1024)) 44 | MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", 4000)) 45 | 46 | MODEL_PATH = os.getenv("MODEL_PATH") 47 | assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}" 48 | BACKEND_TYPE = 
os.getenv("BACKEND_TYPE") 49 | assert BACKEND_TYPE is not None, f"BACKEND_TYPE is required, got: {BACKEND_TYPE}" 50 | 51 | LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True"))) 52 | 53 | if args.model_path != "": 54 | MODEL_PATH = args.model_path 55 | if args.backend_type != "": 56 | BACKEND_TYPE = args.backend_type 57 | if args.load_in_8bit: 58 | LOAD_IN_8BIT = True 59 | 60 | llama2_wrapper = LLAMA2_WRAPPER( 61 | model_path=MODEL_PATH, 62 | backend_type=BACKEND_TYPE, 63 | max_tokens=MAX_INPUT_TOKEN_LENGTH, 64 | load_in_8bit=LOAD_IN_8BIT, 65 | # verbose=True, 66 | ) 67 | 68 | DESCRIPTION = """ 69 | # llama2-webui 70 | """ 71 | DESCRIPTION2 = """ 72 | - Supporting models: [Llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) ... 73 | - Supporting model backends: [tranformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp) 74 | """ 75 | 76 | def clear_and_save_textbox(message: str) -> tuple[str, str]: 77 | return "", message 78 | 79 | def save_textbox_for_prompt(message: str) -> str: 80 | logging.info("start save_textbox_from_prompt") 81 | message = convert_summary_to_prompt(message) 82 | return message 83 | 84 | def display_input( 85 | message: str, history: list[tuple[str, str]] 86 | ) -> list[tuple[str, str]]: 87 | history.append((message, "")) 88 | return history 89 | 90 | def delete_prev_fn( 91 | history: list[tuple[str, str]] 92 | ) -> tuple[list[tuple[str, str]], str]: 93 | try: 94 | message, _ = history.pop() 95 | except IndexError: 96 | message = "" 97 | return history, message or "" 98 | 99 | def generate( 100 | message: str, 101 | history_with_input: list[tuple[str, str]], 102 | system_prompt: str, 103 | max_new_tokens: int, 104 | temperature: float, 105 | top_p: float, 106 | top_k: int, 107 | ) -> Iterator[list[tuple[str, str]]]: 108 | if max_new_tokens > MAX_MAX_NEW_TOKENS: 109 | raise ValueError 110 | try: 111 | history = history_with_input[:-1] 112 | generator = llama2_wrapper.run( 113 | message, 114 | history, 115 | system_prompt, 116 | max_new_tokens, 117 | temperature, 118 | top_p, 119 | top_k, 120 | ) 121 | try: 122 | first_response = next(generator) 123 | yield history + [(message, first_response)] 124 | except StopIteration: 125 | yield history + [(message, "")] 126 | for response in generator: 127 | yield history + [(message, response)] 128 | except Exception as e: 129 | logging.exception(e) 130 | 131 | def check_input_token_length( 132 | message: str, chat_history: list[tuple[str, str]], system_prompt: str 133 | ) -> None: 134 | input_token_length = llama2_wrapper.get_input_token_length( 135 | message, chat_history, system_prompt 136 | ) 137 | if input_token_length > MAX_INPUT_TOKEN_LENGTH: 138 | raise gr.Error( 139 | f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again." 
140 | ) 141 | 142 | prompts_container = PromtsContainer() 143 | prompts = prompts_container.get_prompts_tab_dict() 144 | default_prompts_checkbox = False 145 | default_advanced_checkbox = False 146 | 147 | def convert_summary_to_prompt(summary): 148 | return prompts_container.get_prompt_by_summary(summary) 149 | 150 | def two_columns_list(tab_data, chatbot): 151 | result = [] 152 | for i in range(int(len(tab_data) / 2) + 1): 153 | row = gr.Row() 154 | with row: 155 | for j in range(2): 156 | index = 2 * i + j 157 | if index >= len(tab_data): 158 | break 159 | item = tab_data[index] 160 | with gr.Group(): 161 | gr.HTML( 162 | f'
{item["act"]}
' 163 | ) 164 | prompt_text = gr.Button( 165 | label="", 166 | value=f"{item['summary']}", 167 | size="sm", 168 | elem_classes="text-left-aligned", 169 | ) 170 | prompt_text.click( 171 | fn=save_textbox_for_prompt, 172 | inputs=prompt_text, 173 | outputs=saved_input, 174 | api_name=False, 175 | queue=True, 176 | ).then( 177 | fn=display_input, 178 | inputs=[saved_input, chatbot], 179 | outputs=chatbot, 180 | api_name=False, 181 | queue=True, 182 | ).then( 183 | fn=check_input_token_length, 184 | inputs=[saved_input, chatbot, system_prompt], 185 | api_name=False, 186 | queue=False, 187 | ).success( 188 | fn=generate, 189 | inputs=[ 190 | saved_input, 191 | chatbot, 192 | system_prompt, 193 | max_new_tokens, 194 | temperature, 195 | top_p, 196 | top_k, 197 | ], 198 | outputs=chatbot, 199 | api_name=False, 200 | ) 201 | result.append(row) 202 | return result 203 | 204 | CSS = """ 205 | .contain { display: flex; flex-direction: column;} 206 | #component-0 #component-1 #component-2 #component-4 #component-5 { height:71vh !important; } 207 | #component-0 #component-1 #component-24 > div:nth-child(2) { height:80vh !important; overflow-y:auto } 208 | .text-left-aligned {text-align: left !important; font-size: 16px;} 209 | """ 210 | with gr.Blocks(css=CSS) as demo: 211 | with gr.Row(equal_height=True): 212 | with gr.Column(scale=2): 213 | gr.Markdown(DESCRIPTION) 214 | with gr.Group(): 215 | chatbot = gr.Chatbot(label="Chatbot") 216 | with gr.Row(): 217 | textbox = gr.Textbox( 218 | container=False, 219 | show_label=False, 220 | placeholder="Type a message...", 221 | scale=10, 222 | ) 223 | submit_button = gr.Button( 224 | "Submit", variant="primary", scale=1, min_width=0 225 | ) 226 | with gr.Row(): 227 | retry_button = gr.Button("🔄 Retry", variant="secondary") 228 | undo_button = gr.Button("↩️ Undo", variant="secondary") 229 | clear_button = gr.Button("🗑️ Clear", variant="secondary") 230 | 231 | saved_input = gr.State() 232 | with gr.Row(): 233 | advanced_checkbox = gr.Checkbox( 234 | label="Advanced", 235 | value=default_prompts_checkbox, 236 | container=False, 237 | elem_classes="min_check", 238 | ) 239 | prompts_checkbox = gr.Checkbox( 240 | label="Prompts", 241 | value=default_prompts_checkbox, 242 | container=False, 243 | elem_classes="min_check", 244 | ) 245 | with gr.Column(visible=default_advanced_checkbox) as advanced_column: 246 | system_prompt = gr.Textbox( 247 | label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6 248 | ) 249 | max_new_tokens = gr.Slider( 250 | label="Max new tokens", 251 | minimum=1, 252 | maximum=MAX_MAX_NEW_TOKENS, 253 | step=1, 254 | value=DEFAULT_MAX_NEW_TOKENS, 255 | ) 256 | temperature = gr.Slider( 257 | label="Temperature", 258 | minimum=0.1, 259 | maximum=4.0, 260 | step=0.1, 261 | value=1.0, 262 | ) 263 | top_p = gr.Slider( 264 | label="Top-p (nucleus sampling)", 265 | minimum=0.05, 266 | maximum=1.0, 267 | step=0.05, 268 | value=0.95, 269 | ) 270 | top_k = gr.Slider( 271 | label="Top-k", 272 | minimum=1, 273 | maximum=1000, 274 | step=1, 275 | value=50, 276 | ) 277 | with gr.Column(scale=1, visible=default_prompts_checkbox) as prompt_column: 278 | gr.HTML( 279 | '
\N{four leaf clover} prompts
' 280 | ) 281 | for k, v in prompts.items(): 282 | with gr.Tab(k, scroll_to_output=True): 283 | lst = two_columns_list(v, chatbot) 284 | prompts_checkbox.change( 285 | lambda x: gr.update(visible=x), 286 | prompts_checkbox, 287 | prompt_column, 288 | queue=False, 289 | ) 290 | advanced_checkbox.change( 291 | lambda x: gr.update(visible=x), 292 | advanced_checkbox, 293 | advanced_column, 294 | queue=False, 295 | ) 296 | 297 | textbox.submit( 298 | fn=clear_and_save_textbox, 299 | inputs=textbox, 300 | outputs=[textbox, saved_input], 301 | api_name=False, 302 | queue=False, 303 | ).then( 304 | fn=display_input, 305 | inputs=[saved_input, chatbot], 306 | outputs=chatbot, 307 | api_name=False, 308 | queue=False, 309 | ).then( 310 | fn=check_input_token_length, 311 | inputs=[saved_input, chatbot, system_prompt], 312 | api_name=False, 313 | queue=False, 314 | ).success( 315 | fn=generate, 316 | inputs=[ 317 | saved_input, 318 | chatbot, 319 | system_prompt, 320 | max_new_tokens, 321 | temperature, 322 | top_p, 323 | top_k, 324 | ], 325 | outputs=chatbot, 326 | api_name=False, 327 | ) 328 | 329 | button_event_preprocess = ( 330 | submit_button.click( 331 | fn=clear_and_save_textbox, 332 | inputs=textbox, 333 | outputs=[textbox, saved_input], 334 | api_name=False, 335 | queue=False, 336 | ) 337 | .then( 338 | fn=display_input, 339 | inputs=[saved_input, chatbot], 340 | outputs=chatbot, 341 | api_name=False, 342 | queue=False, 343 | ) 344 | .then( 345 | fn=check_input_token_length, 346 | inputs=[saved_input, chatbot, system_prompt], 347 | api_name=False, 348 | queue=False, 349 | ) 350 | .success( 351 | fn=generate, 352 | inputs=[ 353 | saved_input, 354 | chatbot, 355 | system_prompt, 356 | max_new_tokens, 357 | temperature, 358 | top_p, 359 | top_k, 360 | ], 361 | outputs=chatbot, 362 | api_name=False, 363 | ) 364 | ) 365 | 366 | retry_button.click( 367 | fn=delete_prev_fn, 368 | inputs=chatbot, 369 | outputs=[chatbot, saved_input], 370 | api_name=False, 371 | queue=False, 372 | ).then( 373 | fn=display_input, 374 | inputs=[saved_input, chatbot], 375 | outputs=chatbot, 376 | api_name=False, 377 | queue=False, 378 | ).then( 379 | fn=generate, 380 | inputs=[ 381 | saved_input, 382 | chatbot, 383 | system_prompt, 384 | max_new_tokens, 385 | temperature, 386 | top_p, 387 | top_k, 388 | ], 389 | outputs=chatbot, 390 | api_name=False, 391 | ) 392 | 393 | undo_button.click( 394 | fn=delete_prev_fn, 395 | inputs=chatbot, 396 | outputs=[chatbot, saved_input], 397 | api_name=False, 398 | queue=False, 399 | ).then( 400 | fn=lambda x: x, 401 | inputs=[saved_input], 402 | outputs=textbox, 403 | api_name=False, 404 | queue=False, 405 | ) 406 | 407 | clear_button.click( 408 | fn=lambda: ([], ""), 409 | outputs=[chatbot, saved_input], 410 | queue=False, 411 | api_name=False, 412 | ) 413 | 414 | demo.queue(max_size=20).launch(share=args.share) 415 | 416 | 417 | if __name__ == "__main__": 418 | main() 419 | -------------------------------------------------------------------------------- /llama2_wrapper/server/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import multiprocessing 3 | from re import compile, Match, Pattern 4 | from threading import Lock 5 | from functools import partial 6 | from typing import Callable, Coroutine, Iterator, List, Optional, Tuple, Union, Dict 7 | from typing_extensions import TypedDict, Literal 8 | 9 | import anyio 10 | from anyio.streams.memory import MemoryObjectSendStream 11 | from starlette.concurrency import 
run_in_threadpool, iterate_in_threadpool 12 | from fastapi import Depends, FastAPI, APIRouter, Request, Response 13 | from fastapi.middleware.cors import CORSMiddleware 14 | from fastapi.responses import JSONResponse 15 | from fastapi.routing import APIRoute 16 | from pydantic import BaseModel, Field 17 | from pydantic_settings import BaseSettings 18 | from sse_starlette.sse import EventSourceResponse 19 | 20 | from llama2_wrapper.model import LLAMA2_WRAPPER 21 | from llama2_wrapper.types import ( 22 | Completion, 23 | CompletionChunk, 24 | ChatCompletion, 25 | ChatCompletionChunk, 26 | ) 27 | 28 | 29 | class Settings(BaseSettings): 30 | model_path: str = Field( 31 | default="", 32 | description="The path to the model to use for generating completions.", 33 | ) 34 | backend_type: str = Field( 35 | default="llama.cpp", 36 | description="Backend for llama2, options: llama.cpp, gptq, transformers", 37 | ) 38 | max_tokens: int = Field(default=4000, ge=1, description="Maximum context size.") 39 | load_in_8bit: bool = Field( 40 | default=False, 41 | description="`Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models).", 42 | ) 43 | verbose: bool = Field( 44 | default=False, 45 | description="Whether to print verbose output to stderr.", 46 | ) 47 | host: str = Field(default="localhost", description="API address") 48 | port: int = Field(default=8000, description="API port") 49 | interrupt_requests: bool = Field( 50 | default=True, 51 | description="Whether to interrupt requests when a new request is received.", 52 | ) 53 | 54 | 55 | class ErrorResponse(TypedDict): 56 | """OpenAI style error response""" 57 | 58 | message: str 59 | type: str 60 | param: Optional[str] 61 | code: Optional[str] 62 | 63 | 64 | class ErrorResponseFormatters: 65 | """Collection of formatters for error responses. 66 | 67 | Args: 68 | request (Union[CreateCompletionRequest, CreateChatCompletionRequest]): 69 | Request body 70 | match (Match[str]): Match object from regex pattern 71 | 72 | Returns: 73 | Tuple[int, ErrorResponse]: Status code and error response 74 | """ 75 | 76 | @staticmethod 77 | def context_length_exceeded( 78 | request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 79 | match, # type: Match[str] # type: ignore 80 | ) -> Tuple[int, ErrorResponse]: 81 | """Formatter for context length exceeded error""" 82 | 83 | context_window = int(match.group(2)) 84 | prompt_tokens = int(match.group(1)) 85 | completion_tokens = request.max_new_tokens 86 | if hasattr(request, "messages"): 87 | # Chat completion 88 | message = ( 89 | "This model's maximum context length is {} tokens. " 90 | "However, you requested {} tokens " 91 | "({} in the messages, {} in the completion). " 92 | "Please reduce the length of the messages or completion." 93 | ) 94 | else: 95 | # Text completion 96 | message = ( 97 | "This model's maximum context length is {} tokens, " 98 | "however you requested {} tokens " 99 | "({} in your prompt; {} for the completion). " 100 | "Please reduce your prompt; or completion length." 
101 | ) 102 | return 400, ErrorResponse( 103 | message=message.format( 104 | context_window, 105 | completion_tokens + prompt_tokens, 106 | prompt_tokens, 107 | completion_tokens, 108 | ), 109 | type="invalid_request_error", 110 | param="messages", 111 | code="context_length_exceeded", 112 | ) 113 | 114 | @staticmethod 115 | def model_not_found( 116 | request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 117 | match, # type: Match[str] # type: ignore 118 | ) -> Tuple[int, ErrorResponse]: 119 | """Formatter for model_not_found error""" 120 | 121 | model_path = str(match.group(1)) 122 | message = f"The model `{model_path}` does not exist" 123 | return 400, ErrorResponse( 124 | message=message, 125 | type="invalid_request_error", 126 | param=None, 127 | code="model_not_found", 128 | ) 129 | 130 | 131 | class RouteErrorHandler(APIRoute): 132 | """Custom APIRoute that handles application errors and exceptions""" 133 | 134 | # key: regex pattern for original error message from llama_cpp 135 | # value: formatter function 136 | pattern_and_formatters: Dict[ 137 | "Pattern", 138 | Callable[ 139 | [ 140 | Union["CreateCompletionRequest", "CreateChatCompletionRequest"], 141 | "Match[str]", 142 | ], 143 | Tuple[int, ErrorResponse], 144 | ], 145 | ] = { 146 | compile( 147 | r"Requested tokens \((\d+)\) exceed context window of (\d+)" 148 | ): ErrorResponseFormatters.context_length_exceeded, 149 | compile( 150 | r"Model path does not exist: (.+)" 151 | ): ErrorResponseFormatters.model_not_found, 152 | } 153 | 154 | def error_message_wrapper( 155 | self, 156 | error: Exception, 157 | body: Optional[ 158 | Union[ 159 | "CreateChatCompletionRequest", 160 | "CreateCompletionRequest", 161 | ] 162 | ] = None, 163 | ) -> Tuple[int, ErrorResponse]: 164 | """Wraps error message in OpenAI style error response""" 165 | 166 | if body is not None and isinstance( 167 | body, 168 | ( 169 | CreateCompletionRequest, 170 | CreateChatCompletionRequest, 171 | ), 172 | ): 173 | # When text completion or chat completion 174 | for pattern, callback in self.pattern_and_formatters.items(): 175 | match = pattern.search(str(error)) 176 | if match is not None: 177 | return callback(body, match) 178 | 179 | # Wrap other errors as internal server error 180 | return 500, ErrorResponse( 181 | message=str(error), 182 | type="internal_server_error", 183 | param=None, 184 | code=None, 185 | ) 186 | 187 | def get_route_handler( 188 | self, 189 | ) -> Callable[[Request], Coroutine[None, None, Response]]: 190 | """Defines custom route handler that catches exceptions and formats 191 | in OpenAI style error response""" 192 | 193 | original_route_handler = super().get_route_handler() 194 | 195 | async def custom_route_handler(request: Request) -> Response: 196 | try: 197 | return await original_route_handler(request) 198 | except Exception as exc: 199 | json_body = await request.json() 200 | try: 201 | if "messages" in json_body: 202 | # Chat completion 203 | body: Optional[ 204 | Union[ 205 | CreateChatCompletionRequest, 206 | CreateCompletionRequest, 207 | ] 208 | ] = CreateChatCompletionRequest(**json_body) 209 | elif "prompt" in json_body: 210 | # Text completion 211 | body = CreateCompletionRequest(**json_body) 212 | # else: 213 | # # Embedding 214 | # body = CreateEmbeddingRequest(**json_body) 215 | except Exception: 216 | # Invalid request body 217 | body = None 218 | 219 | # Get proper error message from the exception 220 | ( 221 | status_code, 222 | error_message, 223 | ) = self.error_message_wrapper(error=exc, 
body=body) 224 | return JSONResponse( 225 | {"error": error_message}, 226 | status_code=status_code, 227 | ) 228 | 229 | return custom_route_handler 230 | 231 | 232 | router = APIRouter(route_class=RouteErrorHandler) 233 | 234 | settings: Optional[Settings] = None 235 | llama2: Optional[LLAMA2_WRAPPER] = None 236 | 237 | 238 | def create_app(settings: Optional[Settings] = None): 239 | if settings is None: 240 | settings = Settings() 241 | app = FastAPI( 242 | title="llama2-wrapper Fast API", 243 | version="0.0.1", 244 | ) 245 | app.add_middleware( 246 | CORSMiddleware, 247 | allow_origins=["*"], 248 | allow_credentials=True, 249 | allow_methods=["*"], 250 | allow_headers=["*"], 251 | ) 252 | app.include_router(router) 253 | global llama2 254 | llama2 = LLAMA2_WRAPPER( 255 | model_path=settings.model_path, 256 | backend_type=settings.backend_type, 257 | max_tokens=settings.max_tokens, 258 | load_in_8bit=settings.load_in_8bit, 259 | verbose=settings.load_in_8bit, 260 | ) 261 | 262 | def set_settings(_settings: Settings): 263 | global settings 264 | settings = _settings 265 | 266 | set_settings(settings) 267 | return app 268 | 269 | 270 | llama_outer_lock = Lock() 271 | llama_inner_lock = Lock() 272 | 273 | 274 | def get_llama(): 275 | # NOTE: This double lock allows the currently streaming llama model to 276 | # check if any other requests are pending in the same thread and cancel 277 | # the stream if so. 278 | llama_outer_lock.acquire() 279 | release_outer_lock = True 280 | try: 281 | llama_inner_lock.acquire() 282 | try: 283 | llama_outer_lock.release() 284 | release_outer_lock = False 285 | yield llama2 286 | finally: 287 | llama_inner_lock.release() 288 | finally: 289 | if release_outer_lock: 290 | llama_outer_lock.release() 291 | 292 | 293 | def get_settings(): 294 | yield settings 295 | 296 | 297 | async def get_event_publisher( 298 | request: Request, 299 | inner_send_chan: MemoryObjectSendStream, 300 | iterator: Iterator, 301 | ): 302 | async with inner_send_chan: 303 | try: 304 | async for chunk in iterate_in_threadpool(iterator): 305 | await inner_send_chan.send(dict(data=json.dumps(chunk))) 306 | if await request.is_disconnected(): 307 | raise anyio.get_cancelled_exc_class()() 308 | if settings.interrupt_requests and llama_outer_lock.locked(): 309 | await inner_send_chan.send(dict(data="[DONE]")) 310 | raise anyio.get_cancelled_exc_class()() 311 | await inner_send_chan.send(dict(data="[DONE]")) 312 | except anyio.get_cancelled_exc_class() as e: 313 | print("disconnected") 314 | with anyio.move_on_after(1, shield=True): 315 | print(f"Disconnected from client (via refresh/close) {request.client}") 316 | raise e 317 | 318 | 319 | stream_field = Field( 320 | default=False, 321 | description="Whether to stream the results as they are generated. Useful for chatbots.", 322 | ) 323 | max_new_tokens_field = Field( 324 | default=1000, ge=1, description="The maximum number of tokens to generate." 
325 | ) 326 | 327 | temperature_field = Field( 328 | default=0.9, 329 | ge=0.0, 330 | le=2.0, 331 | description="The temperature to use for sampling.", 332 | ) 333 | 334 | top_p_field = Field( 335 | default=1.0, 336 | ge=0.0, 337 | le=1.0, 338 | description="The top-p value to use for sampling.", 339 | ) 340 | top_k_field = Field( 341 | default=40, 342 | ge=0, 343 | description="The top-k value to use for sampling.", 344 | ) 345 | repetition_penalty_field = Field( 346 | default=1.0, 347 | ge=0.0, 348 | description="The penalty to apply to repeated tokens.", 349 | ) 350 | # stop_field = Field( 351 | # default=None, 352 | # description="A list of tokens at which to stop generation. If None, no stop tokens are used.", 353 | # ) 354 | 355 | 356 | class CreateCompletionRequest(BaseModel): 357 | prompt: Union[str, List[str]] = Field( 358 | default="", description="The prompt to generate text from." 359 | ) 360 | stream: bool = stream_field 361 | max_new_tokens: int = max_new_tokens_field 362 | temperature: float = temperature_field 363 | top_p: float = top_p_field 364 | top_k: int = top_k_field 365 | repetition_penalty: float = repetition_penalty_field 366 | # stop: Optional[Union[str, List[str]]] = stop_field 367 | 368 | model_config = { 369 | "json_schema_extra": { 370 | "examples": [ 371 | { 372 | "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", 373 | # "stop": ["\n", "###"], 374 | } 375 | ] 376 | } 377 | } 378 | 379 | 380 | @router.post( 381 | "/v1/completions", 382 | ) 383 | async def create_completion( 384 | request: Request, 385 | body: CreateCompletionRequest, 386 | llama2: LLAMA2_WRAPPER = Depends(get_llama), 387 | ) -> Completion: 388 | if isinstance(body.prompt, list): 389 | assert len(body.prompt) <= 1 390 | body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" 391 | 392 | kwargs = body.model_dump() 393 | 394 | iterator_or_completion: Union[ 395 | Completion, Iterator[CompletionChunk] 396 | ] = await run_in_threadpool(llama2.completion, **kwargs) 397 | 398 | if isinstance(iterator_or_completion, Iterator): 399 | first_response = await run_in_threadpool(next, iterator_or_completion) 400 | 401 | # If no exception was raised from first_response, we can assume that 402 | # the iterator is valid and we can use it to stream the response. 403 | def iterator() -> Iterator[CompletionChunk]: 404 | yield first_response 405 | yield from iterator_or_completion 406 | 407 | send_chan, recv_chan = anyio.create_memory_object_stream(10) 408 | return EventSourceResponse( 409 | recv_chan, 410 | data_sender_callable=partial( # type: ignore 411 | get_event_publisher, 412 | request=request, 413 | inner_send_chan=send_chan, 414 | iterator=iterator(), 415 | ), 416 | ) 417 | else: 418 | return iterator_or_completion 419 | 420 | 421 | class ChatCompletionRequestMessage(BaseModel): 422 | role: Literal["system", "user", "assistant"] = Field( 423 | default="user", description="The role of the message." 424 | ) 425 | content: str = Field(default="", description="The content of the message.") 426 | 427 | 428 | class CreateChatCompletionRequest(BaseModel): 429 | messages: List[ChatCompletionRequestMessage] = Field( 430 | default=[], description="A list of messages to generate completions for." 
431 | ) 432 | stream: bool = stream_field 433 | max_new_tokens: int = max_new_tokens_field 434 | temperature: float = temperature_field 435 | top_p: float = top_p_field 436 | top_k: int = top_k_field 437 | repetition_penalty: float = repetition_penalty_field 438 | # stop: Optional[List[str]] = stop_field 439 | 440 | model_config = { 441 | "json_schema_extra": { 442 | "examples": [ 443 | { 444 | "messages": [ 445 | ChatCompletionRequestMessage( 446 | role="system", content="You are a helpful assistant." 447 | ).model_dump(), 448 | ChatCompletionRequestMessage( 449 | role="user", content="What is the capital of France?" 450 | ).model_dump(), 451 | ] 452 | } 453 | ] 454 | } 455 | } 456 | 457 | 458 | @router.post( 459 | "/v1/chat/completions", 460 | ) 461 | async def create_chat_completion( 462 | request: Request, 463 | body: CreateChatCompletionRequest, 464 | llama2: LLAMA2_WRAPPER = Depends(get_llama), 465 | settings: Settings = Depends(get_settings), 466 | ) -> ChatCompletion: 467 | kwargs = body.model_dump() 468 | 469 | iterator_or_completion: Union[ 470 | ChatCompletion, Iterator[ChatCompletionChunk] 471 | ] = await run_in_threadpool(llama2.chat_completion, **kwargs) 472 | 473 | if isinstance(iterator_or_completion, Iterator): 474 | first_response = await run_in_threadpool(next, iterator_or_completion) 475 | 476 | # If no exception was raised from first_response, we can assume that 477 | # the iterator is valid and we can use it to stream the response. 478 | def iterator() -> Iterator[ChatCompletionChunk]: 479 | yield first_response 480 | yield from iterator_or_completion 481 | 482 | send_chan, recv_chan = anyio.create_memory_object_stream(10) 483 | return EventSourceResponse( 484 | recv_chan, 485 | data_sender_callable=partial( # type: ignore 486 | get_event_publisher, 487 | request=request, 488 | inner_send_chan=send_chan, 489 | iterator=iterator(), 490 | ), 491 | ) 492 | else: 493 | return iterator_or_completion 494 | 495 | 496 | class ModelData(TypedDict): 497 | id: str 498 | object: Literal["model"] 499 | owned_by: str 500 | permissions: List[str] 501 | 502 | 503 | class ModelList(TypedDict): 504 | object: Literal["list"] 505 | data: List[ModelData] 506 | 507 | 508 | @router.get("/v1/models") 509 | async def get_models( 510 | settings: Settings = Depends(get_settings), 511 | ) -> ModelList: 512 | assert llama2 is not None 513 | 514 | return { 515 | "object": "list", 516 | "data": [ 517 | { 518 | "id": settings.backend_type + " default model" 519 | if settings.model_path == "" 520 | else settings.model_path, 521 | "object": "model", 522 | "owned_by": "me", 523 | "permissions": [], 524 | } 525 | ], 526 | } 527 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llama2-webui 2 | 3 | Running Llama 2 with gradio web UI on GPU or CPU from anywhere (Linux/Windows/Mac). 4 | - Supporting all Llama 2 models (7B, 13B, 70B, GPTQ, GGML, GGUF, [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)) with 8-bit, 4-bit mode. 5 | - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb). 6 | - [Run OpenAI Compatible API](#start-openai-compatible-api) on Llama2 models. 
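As a quick illustration of that last point, once the API server from [Start OpenAI Compatible API](#start-openai-compatible-api) is running on its default `localhost:8000`, the `/v1/completions` route can be called from any HTTP client. A minimal sketch, assuming the `requests` package is available in your environment:

```python
import requests  # assumed available; any OpenAI-compatible or plain HTTP client works

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "prompt": "What is the capital of France?",
        "max_new_tokens": 128,
        "temperature": 0.9,
    },
)
# non-streaming responses follow the OpenAI completion shape
print(resp.json()["choices"][0]["text"])
```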
7 | 8 | ![screenshot](./static/screenshot.png) 9 | 10 | ![code_llama_playground](https://i.imgur.com/FgMUiT6.gif) 11 | 12 | ## Features 13 | 14 | - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [Llama-2-GGUF](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) ... 15 | - Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp) 16 | - Demos: [Run Llama2 on MacBook Air](https://twitter.com/liltom_eth/status/1682791729207070720?s=20); [Run Llama2 on free Colab T4 GPU](./colab/Llama_2_7b_Chat_GPTQ.ipynb) 17 | - Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb). 18 | - [Run OpenAI Compatible API](#start-openai-compatible-api) on Llama2 models. 19 | - [News](./docs/news.md), [Benchmark](./docs/performance.md), [Issue Solutions](./docs/issues.md) 20 | 21 | ## Contents 22 | 23 | - [Install](#install) 24 | - [Usage](#usage) 25 | - [Start Chat UI](#start-chat-ui) 26 | - [Start Code Llama UI](#start-code-llama-ui) 27 | - [Use llama2-wrapper for Your App](#use-llama2-wrapper-for-your-app) 28 | - [Start OpenAI Compatible API](#start-openai-compatible-api) 29 | - [Benchmark](#benchmark) 30 | - [Download Llama-2 Models](#download-llama-2-models) 31 | - [Model List](#model-list) 32 | - [Download Script](#download-script) 33 | - [Tips](#tips) 34 | - [Env Examples](#env-examples) 35 | - [Run on Nvidia GPU](#run-on-nvidia-gpu) 36 | - [Run bitsandbytes 8 bit](#run-bitsandbytes-8-bit) 37 | - [Run GPTQ 4 bit](#run-gptq-4-bit) 38 | - [Run on CPU](#run-on-cpu) 39 | - [Mac Metal Acceleration](#mac-metal-acceleration) 40 | - [AMD/Nvidia GPU Acceleration](#amdnvidia-gpu-acceleration) 41 | - [License](#license) 42 | - [Contributing](#contributing) 43 | 44 | 45 | 46 | ## Install 47 | ### Method 1: From [PyPI](https://pypi.org/project/llama2-wrapper/) 48 | ``` 49 | pip install llama2-wrapper 50 | ``` 51 | The newest `llama2-wrapper>=0.1.14` supports llama.cpp's `gguf` models. 52 | 53 | If you would like to use old `ggml` models, install `llama2-wrapper<=0.1.13` or manually install `llama-cpp-python==0.1.77`. 54 | 55 | ### Method 2: From Source: 56 | 57 | ``` 58 | git clone https://github.com/liltom-eth/llama2-webui.git 59 | cd llama2-webui 60 | pip install -r requirements.txt 61 | ``` 62 | ### Install Issues: 63 | `bitsandbytes >= 0.39` may not work on older NVIDIA GPUs. 
In that case, to use `LOAD_IN_8BIT`, you may have to downgrade like this: 64 | 65 | - `pip install bitsandbytes==0.38.1` 66 | 67 | `bitsandbytes` also need a special install for Windows: 68 | 69 | ``` 70 | pip uninstall bitsandbytes 71 | pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.0-py3-none-win_amd64.whl 72 | ``` 73 | 74 | ## Usage 75 | 76 | ### Start Chat UI 77 | 78 | Run chatbot simply with web UI: 79 | 80 | ```bash 81 | python app.py 82 | ``` 83 | 84 | `app.py` will load the default config `.env` which uses `llama.cpp` as the backend to run `llama-2-7b-chat.ggmlv3.q4_0.bin` model for inference. The model `llama-2-7b-chat.ggmlv3.q4_0.bin` will be automatically downloaded. 85 | 86 | ```bash 87 | Running on backend llama.cpp. 88 | Use default model path: ./models/llama-2-7b-chat.Q4_0.gguf 89 | Start downloading model to: ./models/llama-2-7b-chat.Q4_0.gguf 90 | ``` 91 | 92 | You can also customize your `MODEL_PATH`, `BACKEND_TYPE,` and model configs in `.env` file to run different llama2 models on different backends (llama.cpp, transformers, gptq). 93 | 94 | ### Start Code Llama UI 95 | 96 | We provide a code completion / filling UI for Code Llama. 97 | 98 | Base model **Code Llama** and extend model **Code Llama — Python** are not fine-tuned to follow instructions. They should be prompted so that the expected answer is the natural continuation of the prompt. That means these two models focus on code filling and code completion. 99 | 100 | Here is an example run CodeLlama code completion on llama.cpp backend: 101 | 102 | ``` 103 | python code_completion.py --model_path ./models/codellama-7b.Q4_0.gguf 104 | ``` 105 | 106 | ![code_llama_playground](https://i.imgur.com/FgMUiT6.gif) 107 | 108 | `codellama-7b.Q4_0.gguf` can be downloaded from [TheBloke/CodeLlama-7B-GGUF](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/blob/main/codellama-7b.Q4_0.gguf). 109 | 110 | **Code Llama — Instruct** trained with “natural language instruction” inputs paired with anticipated outputs. This strategic methodology enhances the model’s capacity to grasp human expectations in prompts. That means instruct models can be used in a chatbot-like app. 111 | 112 | Example run CodeLlama chat on gptq backend: 113 | 114 | ``` 115 | python app.py --backend_type gptq --model_path ./models/CodeLlama-7B-Instruct-GPTQ/ --share True 116 | ``` 117 | 118 | ![code_llama_chat](https://i.imgur.com/lQLfemB.gif) 119 | 120 | `CodeLlama-7B-Instruct-GPTQ` can be downloaded from [TheBloke/CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) 121 | 122 | ### Use llama2-wrapper for Your App 123 | 124 | 🔥 For developers, we released `llama2-wrapper` as a llama2 backend wrapper in [PYPI](https://pypi.org/project/llama2-wrapper/). 125 | 126 | Use `llama2-wrapper` as your local llama2 backend to answer questions and more, [colab example](./colab/ggmlv3_q4_0.ipynb): 127 | 128 | ```python 129 | # pip install llama2-wrapper 130 | from llama2_wrapper import LLAMA2_WRAPPER, get_prompt 131 | llama2_wrapper = LLAMA2_WRAPPER() 132 | # Default running on backend llama.cpp. 
133 | # Automatically downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin 134 | prompt = "Do you know Pytorch" 135 | answer = llama2_wrapper(get_prompt(prompt), temperature=0.9) 136 | ``` 137 | 138 | Run gptq llama2 model on Nvidia GPU, [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb): 139 | 140 | ```python 141 | from llama2_wrapper import LLAMA2_WRAPPER 142 | llama2_wrapper = LLAMA2_WRAPPER(backend_type="gptq") 143 | # Automatically downloading model to: ./models/Llama-2-7b-Chat-GPTQ 144 | ``` 145 | 146 | Run llama2 7b with bitsandbytes 8 bit with a `model_path`: 147 | 148 | ```python 149 | from llama2_wrapper import LLAMA2_WRAPPER 150 | llama2_wrapper = LLAMA2_WRAPPER( 151 | model_path = "./models/Llama-2-7b-chat-hf", 152 | backend_type = "transformers", 153 | load_in_8bit = True 154 | ) 155 | ``` 156 | Check [API Document](https://pypi.org/project/llama2-wrapper/) for more usages. 157 | 158 | ### Start OpenAI Compatible API 159 | 160 | `llama2-wrapper` offers a web server that acts as a drop-in replacement for the OpenAI API. This allows you to use Llama2 models with any OpenAI compatible clients, libraries or services, etc. 161 | 162 | Start Fast API: 163 | 164 | ``` 165 | python -m llama2_wrapper.server 166 | ``` 167 | 168 | it will use `llama.cpp` as the backend by default to run `llama-2-7b-chat.ggmlv3.q4_0.bin` model. 169 | 170 | Start Fast API for `gptq` backend: 171 | 172 | ``` 173 | python -m llama2_wrapper.server --backend_type gptq 174 | ``` 175 | 176 | Navigate to http://localhost:8000/docs to see the OpenAPI documentation. 177 | 178 | #### Basic settings 179 | 180 | | Flag | Description | 181 | | ---------------- | ------------------------------------------------------------ | 182 | | `-h`, `--help` | Show this help message. | 183 | | `--model_path` | The path to the model to use for generating completions. | 184 | | `--backend_type` | Backend for llama2, options: llama.cpp, gptq, transformers | 185 | | `--max_tokens` | Maximum context size. | 186 | | `--load_in_8bit` | Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models). | 187 | | `--verbose` | Whether to print verbose output to stderr. | 188 | | `--host` | API address | 189 | | `--port` | API port | 190 | 191 | ## Benchmark 192 | 193 | Run benchmark script to compute performance on your device, `benchmark.py` will load the same `.env` as `app.py`.: 194 | 195 | ```bash 196 | python benchmark.py 197 | ``` 198 | 199 | You can also select the `iter`, `backend_type` and `model_path` the benchmark will be run (overwrite .env args) : 200 | 201 | ```bash 202 | python benchmark.py --iter NB_OF_ITERATIONS --backend_type gptq 203 | ``` 204 | 205 | By default, the number of iterations is 5, but if you want a faster result or a more accurate one 206 | you can set it to whatever value you want, but please only report results with at least 5 iterations. 207 | 208 | This [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb) also show you how to benchmark gptq model on free Google Colab T4 GPU. 
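For example, a run against a local GGUF model on the llama.cpp backend might look like the sketch below (the model path is illustrative, and it assumes `benchmark.py` exposes `--model_path` as described above):

```bash
python benchmark.py --iter 5 --backend_type llama.cpp --model_path ./models/llama-2-7b-chat.Q4_0.gguf
```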
209 | 210 | Some benchmark performance: 211 | 212 | | Model | Precision | Device | RAM / GPU VRAM | Speed (tokens/sec) | load time (s) | 213 | | --------------------------- | --------- | ------------------ | -------------- | ------------------ | ------------- | 214 | | Llama-2-7b-chat-hf | 8 bit | NVIDIA RTX 2080 Ti | 7.7 GB VRAM | 3.76 | 641.36 | 215 | | Llama-2-7b-Chat-GPTQ | 4 bit | NVIDIA RTX 2080 Ti | 5.8 GB VRAM | 18.85 | 192.91 | 216 | | Llama-2-7b-Chat-GPTQ | 4 bit | Google Colab T4 | 5.8 GB VRAM | 18.19 | 37.44 | 217 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M1 Pro CPU | 5.4 GB RAM | 17.90 | 0.18 | 218 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 CPU | 5.4 GB RAM | 13.70 | 0.13 | 219 | | llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 Metal | 5.4 GB RAM | 12.60 | 0.10 | 220 | | llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Intel i7-8700 | 4.5 GB RAM | 7.88 | 31.90 | 221 | 222 | Check/contribute the performance of your device in the full [performance doc](./docs/performance.md). 223 | 224 | ## Download Llama-2 Models 225 | 226 | Llama 2 is a collection of pre-trained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. 227 | 228 | Llama-2-7b-Chat-GPTQ is the GPTQ model files for [Meta's Llama 2 7b Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). GPTQ 4-bit Llama-2 model require less GPU VRAM to run it. 229 | 230 | ### Model List 231 | 232 | | Model Name | set MODEL_PATH in .env | Download URL | 233 | | ----------------------------------- | ---------------------------------------- | ------------------------------------------------------------ | 234 | | meta-llama/Llama-2-7b-chat-hf | /path-to/Llama-2-7b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-7b-chat-hf) | 235 | | meta-llama/Llama-2-13b-chat-hf | /path-to/Llama-2-13b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-13b-chat-hf) | 236 | | meta-llama/Llama-2-70b-chat-hf | /path-to/Llama-2-70b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-70b-chat-hf) | 237 | | meta-llama/Llama-2-7b-hf | /path-to/Llama-2-7b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 238 | | meta-llama/Llama-2-13b-hf | /path-to/Llama-2-13b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 239 | | meta-llama/Llama-2-70b-hf | /path-to/Llama-2-70b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 240 | | TheBloke/Llama-2-7b-Chat-GPTQ | /path-to/Llama-2-7b-Chat-GPTQ | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) | 241 | | TheBloke/Llama-2-7b-Chat-GGUF | /path-to/llama-2-7b-chat.Q4_0.gguf | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_0.gguf) | 242 | | TheBloke/Llama-2-7B-Chat-GGML | /path-to/llama-2-7b-chat.ggmlv3.q4_0.bin | [Link](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) | 243 | | TheBloke/CodeLlama-7B-Instruct-GPTQ | TheBloke/CodeLlama-7B-Instruct-GPTQ | [Link](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) | 244 | | ... | ... | ... | 245 | 246 | Running 4-bit model `Llama-2-7b-Chat-GPTQ` needs GPU with 6GB VRAM. 247 | 248 | Running 4-bit model `llama-2-7b-chat.ggmlv3.q4_0.bin` needs CPU with 6GB RAM. There is also a list of other 2, 3, 4, 5, 6, 8-bit GGML models that can be used from [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML). 
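For instance, the 2-bit variant benchmarked earlier could be fetched with the download helper described in the next section; the exact file name here is an assumption based on the repo's naming pattern, so check the model card first:

```bash
python -m llama2_wrapper.download --repo_id TheBloke/Llama-2-7B-Chat-GGML --filename llama-2-7b-chat.ggmlv3.q2_K.bin --save_dir ./models
```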
249 | 250 | ### Download Script 251 | 252 | These models can be downloaded through: 253 | 254 | ```bash 255 | python -m llama2_wrapper.download --repo_id TheBloke/CodeLlama-7B-Python-GPTQ 256 | 257 | python -m llama2_wrapper.download --repo_id TheBloke/Llama-2-7b-Chat-GGUF --filename llama-2-7b-chat.Q4_0.gguf --save_dir ./models 258 | ``` 259 | 260 | Or use CMD like: 261 | 262 | ```bash 263 | # Make sure you have git-lfs installed (https://git-lfs.com) 264 | git lfs install 265 | git clone git@hf.co:meta-llama/Llama-2-7b-chat-hf 266 | ``` 267 | 268 | To download Llama 2 models, you need to request access from [https://ai.meta.com/llama/](https://ai.meta.com/llama/) and also enable access on repos like [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main). Requests will be processed in hours. 269 | 270 | For GPTQ models like [TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), you can directly download without requesting access. 271 | 272 | For GGML models like [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), you can directly download without requesting access. 273 | 274 | ## Tips 275 | 276 | ### Env Examples 277 | 278 | There are some examples in `./env_examples/` folder. 279 | 280 | | Model Setup | Example .env | 281 | | ------------------------------------------------------ | --------------------------- | 282 | | Llama-2-7b-chat-hf 8-bit (transformers backend) | .env.7b_8bit_example | 283 | | Llama-2-7b-Chat-GPTQ 4-bit (gptq transformers backend) | .env.7b_gptq_example | 284 | | Llama-2-7B-Chat-GGML 4bit (llama.cpp backend) | .env.7b_ggmlv3_q4_0_example | 285 | | Llama-2-13b-chat-hf (transformers backend) | .env.13b_example | 286 | | ... | ... | 287 | 288 | ### Run on Nvidia GPU 289 | 290 | The running requires around 14GB of GPU VRAM for Llama-2-7b and 28GB of GPU VRAM for Llama-2-13b. 291 | 292 | If you are running on multiple GPUs, the model will be loaded automatically on GPUs and split the VRAM usage. That allows you to run Llama-2-7b (requires 14GB of GPU VRAM) on a setup like 2 GPUs (11GB VRAM each). 293 | 294 | #### Run bitsandbytes 8 bit 295 | 296 | If you do not have enough memory, you can set up your `LOAD_IN_8BIT` as `True` in `.env`. This can reduce memory usage by around half with slightly degraded model quality. It is compatible with the CPU, GPU, and Metal backend. 297 | 298 | Llama-2-7b with 8-bit compression can run on a single GPU with 8 GB of VRAM, like an Nvidia RTX 2080Ti, RTX 4080, T4, V100 (16GB). 299 | 300 | #### Run GPTQ 4 bit 301 | 302 | If you want to run 4 bit Llama-2 model like `Llama-2-7b-Chat-GPTQ`, you can set up your `BACKEND_TYPE` as `gptq` in `.env` like example `.env.7b_gptq_example`. 303 | 304 | Make sure you have downloaded the 4-bit model from `Llama-2-7b-Chat-GPTQ` and set the `MODEL_PATH` and arguments in `.env` file. 305 | 306 | `Llama-2-7b-Chat-GPTQ` can run on a single GPU with 6 GB of VRAM. 
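Putting the pieces above together, a minimal `.env` for this setup might look like the sketch below (values are illustrative; `.env.7b_gptq_example` in `env_examples` is the authoritative reference):

```
MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ"
BACKEND_TYPE = "gptq"
LOAD_IN_8BIT = False
```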
307 | 308 | If you encounter issue like `NameError: name 'autogptq_cuda_256' is not defined`, please refer to [here](https://huggingface.co/TheBloke/open-llama-13b-open-instruct-GPTQ/discussions/1) 309 | > pip install https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl 310 | 311 | ### Run on CPU 312 | 313 | Run Llama-2 model on CPU requires [llama.cpp](https://github.com/ggerganov/llama.cpp) dependency and [llama.cpp Python Bindings](https://github.com/abetlen/llama-cpp-python), which are already installed. 314 | 315 | 316 | Download GGML models like `llama-2-7b-chat.ggmlv3.q4_0.bin` following [Download Llama-2 Models](#download-llama-2-models) section. `llama-2-7b-chat.ggmlv3.q4_0.bin` model requires at least 6 GB RAM to run on CPU. 317 | 318 | Set up configs like `.env.7b_ggmlv3_q4_0_example` from `env_examples` as `.env`. 319 | 320 | Run web UI `python app.py` . 321 | 322 | #### Mac Metal Acceleration 323 | 324 | For Mac users, you can also set up Mac Metal for acceleration, try install this dependencies: 325 | 326 | ```bash 327 | pip uninstall llama-cpp-python -y 328 | CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir 329 | pip install 'llama-cpp-python[server]' 330 | ``` 331 | 332 | or check details: 333 | 334 | - [MacOS Install with Metal GPU](https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md) 335 | 336 | #### AMD/Nvidia GPU Acceleration 337 | 338 | If you would like to use AMD/Nvidia GPU for acceleration, check this: 339 | 340 | - [Installation with OpenBLAS / cuBLAS / CLBlast / Metal](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal) 341 | 342 | 343 | 344 | 345 | 346 | ## License 347 | 348 | MIT - see [MIT License](LICENSE) 349 | 350 | This project enables users to adapt it freely for proprietary purposes without any restrictions. 351 | 352 | ## Contributing 353 | 354 | Kindly read our [Contributing Guide](CONTRIBUTING.md) to learn and understand our development process. 
355 | 356 | ### All Contributors 357 | 358 | 359 | 360 | 361 | 362 | ### Review 363 | Github 364 | 365 | ### Star History 366 | 367 | [![Star History Chart](https://api.star-history.com/svg?repos=liltom-eth/llama2-webui&type=Date)](https://star-history.com/#liltom-eth/llama2-webui&Date) 368 | 369 | ## Credits 370 | 371 | - https://huggingface.co/meta-llama/Llama-2-7b-chat-hf 372 | - https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat 373 | - https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ 374 | - [https://github.com/ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp) 375 | - [https://github.com/TimDettmers/bitsandbytes](https://github.com/TimDettmers/bitsandbytes) 376 | - [https://github.com/PanQiWei/AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) 377 | - [https://github.com/abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) 378 | -------------------------------------------------------------------------------- /colab/webui_CodeLlama_7B_Instruct_GPTQ.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "gpuType": "T4", 8 | "authorship_tag": "ABX9TyOZhPcZe61RhDjhEFQv0vrl", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | }, 18 | "accelerator": "GPU" 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "id": "7O5JSosg5-rx" 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "!pip install -U llama2-wrapper==0.1.12" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "source": [ 45 | "%cd /content\n", 46 | "!git clone https://github.com/liltom-eth/llama2-webui\n", 47 | "\n", 48 | "%cd /content/llama2-webui\n", 49 | "!python -m llama2_wrapper.download --repo_id TheBloke/CodeLlama-7B-Instruct-GPTQ\n", 50 | "\n", 51 | "%cd /content/llama2-webui\n", 52 | "!python app.py --backend_type gptq --model_path ./models/CodeLlama-7B-Instruct-GPTQ/ --share True" 53 | ], 54 | "metadata": { 55 | "colab": { 56 | "base_uri": "https://localhost:8080/" 57 | }, 58 | "id": "Y6A7bJdkmzY8", 59 | "outputId": "0d702a7d-68ab-4747-f012-246d4dee3718" 60 | }, 61 | "execution_count": 4, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "name": "stdout", 66 | "text": [ 67 | "/content\n", 68 | "fatal: destination path 'llama2-webui' already exists and is not an empty directory.\n", 69 | "/content/llama2-webui\n", 70 | "Start downloading model TheBloke/CodeLlama-7B-Instruct-GPTQ to: ./models/CodeLlama-7B-Instruct-GPTQ\n", 71 | "Fetching 15 files: 0% 0/15 [00:00\n", 499 | " main()\n", 500 | " File \"/content/llama2-webui/app.py\", line 318, in main\n", 501 | " demo.queue(max_size=20).launch(share=args.share)\n", 502 | " File \"/usr/local/lib/python3.10/dist-packages/gradio/blocks.py\", line 2046, in launch\n", 503 | " self.block_thread()\n", 504 | " File \"/usr/local/lib/python3.10/dist-packages/gradio/blocks.py\", line 2132, in block_thread\n", 505 | " print(\"Keyboard interruption in main thread... 
closing server.\")\n", 506 | "KeyboardInterrupt\n", 507 | "Killing tunnel 127.0.0.1:7860 <> https://71c3606942c440e7dd.gradio.live\n", 508 | "terminate called without an active exception\n" 509 | ] 510 | } 511 | ] 512 | } 513 | ] 514 | } -------------------------------------------------------------------------------- /llama2_wrapper/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import uuid 4 | from enum import Enum 5 | from threading import Thread 6 | from typing import Any, Iterator, Union, List 7 | from llama2_wrapper.types import ( 8 | Completion, 9 | CompletionChunk, 10 | ChatCompletion, 11 | ChatCompletionChunk, 12 | # ChatCompletionMessage, 13 | Message, 14 | B_INST, 15 | E_INST, 16 | B_SYS, 17 | E_SYS, 18 | ) 19 | 20 | 21 | class LLAMA2_WRAPPER: 22 | def __init__( 23 | self, 24 | model_path: str = "", 25 | backend_type: str = "llama.cpp", 26 | max_tokens: int = 4000, 27 | load_in_8bit: bool = True, 28 | verbose: bool = False, 29 | ): 30 | """Load a llama2 model from `model_path`. 31 | 32 | Args: 33 | model_path: Path to the model. 34 | backend_type: Backend for llama2, options: llama.cpp, gptq, transformers 35 | max_tokens: Maximum context size. 36 | load_in_8bit: Use bitsandbytes to run model in 8 bit mode (only for transformers models). 37 | verbose: Print verbose output to stderr. 38 | 39 | Raises: 40 | ValueError: If the model path does not exist. 41 | 42 | Returns: 43 | A LLAMA2_WRAPPER instance. 44 | """ 45 | self.model_path = model_path 46 | self.backend_type = BackendType.get_type(backend_type) 47 | self.max_tokens = max_tokens 48 | self.load_in_8bit = load_in_8bit 49 | 50 | self.model = None 51 | self.tokenizer = None 52 | 53 | self.verbose = verbose 54 | 55 | if self.backend_type is BackendType.LLAMA_CPP: 56 | print("Running on backend llama.cpp.") 57 | else: 58 | import torch 59 | 60 | if torch.cuda.is_available(): 61 | print("Running on GPU with backend torch transformers.") 62 | else: 63 | print("GPU CUDA not found.") 64 | 65 | self.default_llamacpp_path = "./models/llama-2-7b-chat.Q4_0.gguf" 66 | self.default_gptq_path = "./models/Llama-2-7b-Chat-GPTQ" 67 | # Download default ggml/gptq model 68 | if self.model_path == "": 69 | print("Model path is empty.") 70 | if self.backend_type is BackendType.LLAMA_CPP: 71 | print("Use default llama.cpp model path: " + self.default_llamacpp_path) 72 | if not os.path.exists(self.default_llamacpp_path): 73 | print("Start downloading model to: " + self.default_llamacpp_path) 74 | from huggingface_hub import hf_hub_download 75 | 76 | hf_hub_download( 77 | repo_id="TheBloke/Llama-2-7b-Chat-GGUF", 78 | filename="llama-2-7b-chat.Q4_0.gguf", 79 | local_dir="./models/", 80 | ) 81 | else: 82 | print("Model exists in ./models/llama-2-7b-chat.Q4_0.gguf.") 83 | self.model_path = self.default_llamacpp_path 84 | elif self.backend_type is BackendType.GPTQ: 85 | print("Use default gptq model path: " + self.default_gptq_path) 86 | if not os.path.exists(self.default_gptq_path): 87 | print("Start downloading model to: " + self.default_gptq_path) 88 | from huggingface_hub import snapshot_download 89 | 90 | snapshot_download( 91 | "TheBloke/Llama-2-7b-Chat-GPTQ", 92 | local_dir=self.default_gptq_path, 93 | ) 94 | else: 95 | print("Model exists in " + self.default_gptq_path) 96 | self.model_path = self.default_gptq_path 97 | 98 | self.init_tokenizer() 99 | self.init_model() 100 | 101 | def init_model(self): 102 | if self.model is None: 103 | self.model = 
LLAMA2_WRAPPER.create_llama2_model( 104 | self.model_path, 105 | self.backend_type, 106 | self.max_tokens, 107 | self.load_in_8bit, 108 | self.verbose, 109 | ) 110 | if self.backend_type is not BackendType.LLAMA_CPP: 111 | self.model.eval() 112 | 113 | def init_tokenizer(self): 114 | if self.backend_type is not BackendType.LLAMA_CPP: 115 | if self.tokenizer is None: 116 | self.tokenizer = LLAMA2_WRAPPER.create_llama2_tokenizer(self.model_path) 117 | 118 | @classmethod 119 | def create_llama2_model( 120 | cls, model_path, backend_type, max_tokens, load_in_8bit, verbose 121 | ): 122 | if backend_type is BackendType.LLAMA_CPP: 123 | from llama_cpp import Llama 124 | 125 | model = Llama( 126 | model_path=model_path, 127 | n_ctx=max_tokens, 128 | n_batch=max_tokens, 129 | verbose=verbose, 130 | ) 131 | elif backend_type is BackendType.GPTQ: 132 | from auto_gptq import AutoGPTQForCausalLM 133 | 134 | model = AutoGPTQForCausalLM.from_quantized( 135 | model_path, 136 | use_safetensors=True, 137 | trust_remote_code=True, 138 | device="cuda:0", 139 | use_triton=False, 140 | quantize_config=None, 141 | ) 142 | elif backend_type is BackendType.TRANSFORMERS: 143 | import torch 144 | from transformers import AutoModelForCausalLM 145 | 146 | model = AutoModelForCausalLM.from_pretrained( 147 | model_path, 148 | device_map="auto", 149 | torch_dtype=torch.float16, 150 | load_in_8bit=load_in_8bit, 151 | ) 152 | else: 153 | print(backend_type + "not implemented.") 154 | return model 155 | 156 | @classmethod 157 | def create_llama2_tokenizer(cls, model_path): 158 | from transformers import AutoTokenizer 159 | 160 | tokenizer = AutoTokenizer.from_pretrained(model_path) 161 | return tokenizer 162 | 163 | def get_token_length( 164 | self, 165 | prompt: str, 166 | ) -> int: 167 | if self.backend_type is BackendType.LLAMA_CPP: 168 | input_ids = self.model.tokenize(bytes(prompt, "utf-8")) 169 | return len(input_ids) 170 | else: 171 | input_ids = self.tokenizer([prompt], return_tensors="np")["input_ids"] 172 | return input_ids.shape[-1] 173 | 174 | def get_input_token_length( 175 | self, 176 | message: str, 177 | chat_history: list[tuple[str, str]] = [], 178 | system_prompt: str = "", 179 | ) -> int: 180 | prompt = get_prompt(message, chat_history, system_prompt) 181 | 182 | return self.get_token_length(prompt) 183 | 184 | def generate( 185 | self, 186 | prompt: str, 187 | max_new_tokens: int = 1000, 188 | temperature: float = 0.9, 189 | top_p: float = 1.0, 190 | top_k: int = 40, 191 | repetition_penalty: float = 1.0, 192 | **kwargs: Any, 193 | ) -> Iterator[str]: 194 | """Create a generator of response from a prompt. 195 | 196 | Examples: 197 | >>> llama2_wrapper = LLAMA2_WRAPPER() 198 | >>> prompt = get_prompt("Hi do you know Pytorch?") 199 | >>> for response in llama2_wrapper.generate(prompt): 200 | ... print(response) 201 | 202 | Args: 203 | prompt: The prompt to generate text from. 204 | max_new_tokens: The maximum number of tokens to generate. 205 | temperature: The temperature to use for sampling. 206 | top_p: The top-p value to use for sampling. 207 | top_k: The top-k value to use for sampling. 208 | repetition_penalty: The penalty to apply to repeated tokens. 209 | kwargs: all other arguments. 210 | 211 | Yields: 212 | The generated text. 
213 | """ 214 | if self.backend_type is BackendType.LLAMA_CPP: 215 | result = self.model( 216 | prompt=prompt, 217 | stream=True, 218 | max_tokens=max_new_tokens, 219 | top_k=top_k, 220 | top_p=top_p, 221 | temperature=temperature, 222 | repeat_penalty=repetition_penalty, 223 | **kwargs, 224 | ) 225 | outputs = [] 226 | for part in result: 227 | text = part["choices"][0]["text"] 228 | outputs.append(text) 229 | yield "".join(outputs) 230 | else: 231 | from transformers import TextIteratorStreamer 232 | 233 | inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda") 234 | 235 | streamer = TextIteratorStreamer( 236 | self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True 237 | ) 238 | generate_kwargs = dict( 239 | inputs, 240 | streamer=streamer, 241 | max_new_tokens=max_new_tokens, 242 | temperature=temperature, 243 | top_p=top_p, 244 | top_k=top_k, 245 | repetition_penalty=repetition_penalty, 246 | # num_beams=1, 247 | ) 248 | generate_kwargs = ( 249 | generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs} 250 | ) 251 | t = Thread(target=self.model.generate, kwargs=generate_kwargs) 252 | t.start() 253 | 254 | outputs = [] 255 | for text in streamer: 256 | outputs.append(text) 257 | yield "".join(outputs) 258 | 259 | def run( 260 | self, 261 | message: str, 262 | chat_history: list[tuple[str, str]] = [], 263 | system_prompt: str = "", 264 | max_new_tokens: int = 1000, 265 | temperature: float = 0.9, 266 | top_p: float = 1.0, 267 | top_k: int = 40, 268 | repetition_penalty: float = 1.0, 269 | ) -> Iterator[str]: 270 | """Create a generator of response from a chat message. 271 | Process message to llama2 prompt with chat history 272 | and system_prompt for chatbot. 273 | 274 | Args: 275 | message: The origianl chat message to generate text from. 276 | chat_history: Chat history list from chatbot. 277 | system_prompt: System prompt for chatbot. 278 | max_new_tokens: The maximum number of tokens to generate. 279 | temperature: The temperature to use for sampling. 280 | top_p: The top-p value to use for sampling. 281 | top_k: The top-k value to use for sampling. 282 | repetition_penalty: The penalty to apply to repeated tokens. 283 | kwargs: all other arguments. 284 | 285 | Yields: 286 | The generated text. 287 | """ 288 | prompt = get_prompt(message, chat_history, system_prompt) 289 | return self.generate( 290 | prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty 291 | ) 292 | 293 | def __call__( 294 | self, 295 | prompt: str, 296 | stream: bool = False, 297 | max_new_tokens: int = 1000, 298 | temperature: float = 0.9, 299 | top_p: float = 1.0, 300 | top_k: int = 40, 301 | repetition_penalty: float = 1.0, 302 | **kwargs: Any, 303 | ) -> Union[str, Iterator[str]]: 304 | """Generate text from a prompt. 305 | 306 | Examples: 307 | >>> llama2_wrapper = LLAMA2_WRAPPER() 308 | >>> prompt = get_prompt("Hi do you know Pytorch?") 309 | >>> print(llama2_wrapper(prompt)) 310 | 311 | Args: 312 | prompt: The prompt to generate text from. 313 | stream: Whether to stream the results. 314 | max_new_tokens: The maximum number of tokens to generate. 315 | temperature: The temperature to use for sampling. 316 | top_p: The top-p value to use for sampling. 317 | top_k: The top-k value to use for sampling. 318 | repetition_penalty: The penalty to apply to repeated tokens. 319 | kwargs: all other arguments. 320 | 321 | Raises: 322 | ValueError: If the requested tokens exceed the context window. 
323 | RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. 324 | 325 | Returns: 326 | Generated text. 327 | """ 328 | if self.backend_type is BackendType.LLAMA_CPP: 329 | completion_or_chunks = self.model.__call__( 330 | prompt, 331 | stream=stream, 332 | max_tokens=max_new_tokens, 333 | temperature=temperature, 334 | top_p=top_p, 335 | top_k=top_k, 336 | repeat_penalty=repetition_penalty, 337 | **kwargs, 338 | ) 339 | if stream: 340 | 341 | def chunk_generator(chunks): 342 | for part in chunks: 343 | chunk = part["choices"][0]["text"] 344 | yield chunk 345 | 346 | chunks: Iterator[str] = chunk_generator(completion_or_chunks) 347 | return chunks 348 | return completion_or_chunks["choices"][0]["text"] 349 | else: 350 | inputs = self.tokenizer([prompt], return_tensors="pt").input_ids 351 | prompt_tokens_len = len(inputs[0]) 352 | inputs = inputs.to("cuda") 353 | generate_kwargs = dict( 354 | inputs=inputs, 355 | max_new_tokens=max_new_tokens, 356 | temperature=temperature, 357 | top_p=top_p, 358 | top_k=top_k, 359 | repetition_penalty=repetition_penalty, 360 | # num_beams=1, 361 | ) 362 | generate_kwargs = ( 363 | generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs} 364 | ) 365 | if stream: 366 | from transformers import TextIteratorStreamer 367 | 368 | streamer = TextIteratorStreamer( 369 | self.tokenizer, 370 | timeout=10.0, 371 | skip_prompt=True, 372 | skip_special_tokens=True, 373 | ) 374 | generate_kwargs["streamer"] = streamer 375 | 376 | t = Thread(target=self.model.generate, kwargs=generate_kwargs) 377 | t.start() 378 | return streamer 379 | else: 380 | output_ids = self.model.generate( 381 | **generate_kwargs, 382 | ) 383 | # skip prompt, skip special tokens 384 | output = self.tokenizer.decode( 385 | output_ids[0][prompt_tokens_len:], skip_special_tokens=True 386 | ) 387 | return output 388 | 389 | def completion( 390 | self, 391 | prompt: str, 392 | stream: bool = False, 393 | max_new_tokens: int = 1000, 394 | temperature: float = 0.9, 395 | top_p: float = 1.0, 396 | top_k: int = 40, 397 | repetition_penalty: float = 1.0, 398 | **kwargs: Any, 399 | ) -> Union[Completion, Iterator[CompletionChunk]]: 400 | """For OpenAI compatible API /v1/completions 401 | Generate text from a prompt. 402 | 403 | Examples: 404 | >>> llama2_wrapper = LLAMA2_WRAPPER() 405 | >>> prompt = get_prompt("Hi do you know Pytorch?") 406 | >>> print(llm.completion(prompt)) 407 | 408 | Args: 409 | prompt: The prompt to generate text from. 410 | stream: Whether to stream the results. 411 | max_new_tokens: The maximum number of tokens to generate. 412 | temperature: The temperature to use for sampling. 413 | top_p: The top-p value to use for sampling. 414 | top_k: The top-k value to use for sampling. 415 | repetition_penalty: The penalty to apply to repeated tokens. 416 | kwargs: all other arguments. 417 | 418 | Raises: 419 | ValueError: If the requested tokens exceed the context window. 420 | RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. 421 | 422 | Returns: 423 | Response object containing the generated text. 
424 | """ 425 | completion_id: str = f"cmpl-{str(uuid.uuid4())}" 426 | created: int = int(time.time()) 427 | model_name: str = ( 428 | self.backend_type + " default model" 429 | if self.model_path == "" 430 | else self.model_path 431 | ) 432 | if self.backend_type is BackendType.LLAMA_CPP: 433 | completion_or_chunks = self.model.__call__( 434 | prompt, 435 | stream=stream, 436 | max_tokens=max_new_tokens, 437 | temperature=temperature, 438 | top_p=top_p, 439 | top_k=top_k, 440 | repeat_penalty=repetition_penalty, 441 | **kwargs, 442 | ) 443 | if stream: 444 | chunks: Iterator[CompletionChunk] = completion_or_chunks 445 | return chunks 446 | return completion_or_chunks 447 | else: 448 | inputs = self.tokenizer([prompt], return_tensors="pt").input_ids 449 | prompt_tokens_len = len(inputs[0]) 450 | inputs = inputs.to("cuda") 451 | generate_kwargs = dict( 452 | inputs=inputs, 453 | max_new_tokens=max_new_tokens, 454 | temperature=temperature, 455 | top_p=top_p, 456 | top_k=top_k, 457 | repetition_penalty=repetition_penalty, 458 | # num_beams=1, 459 | ) 460 | generate_kwargs = ( 461 | generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs} 462 | ) 463 | if stream: 464 | from transformers import TextIteratorStreamer 465 | 466 | streamer = TextIteratorStreamer( 467 | self.tokenizer, 468 | timeout=10.0, 469 | skip_prompt=True, 470 | skip_special_tokens=True, 471 | ) 472 | generate_kwargs["streamer"] = streamer 473 | 474 | t = Thread(target=self.model.generate, kwargs=generate_kwargs) 475 | t.start() 476 | 477 | def chunk_generator(chunks): 478 | for part in chunks: 479 | yield { 480 | "id": completion_id, 481 | "object": "text_completion", 482 | "created": created, 483 | "model": model_name, 484 | "choices": [ 485 | { 486 | "text": part, 487 | "index": 0, 488 | "logprobs": None, 489 | "finish_reason": None, 490 | } 491 | ], 492 | } 493 | 494 | chunks: Iterator[CompletionChunk] = chunk_generator(streamer) 495 | return chunks 496 | 497 | else: 498 | output_ids = self.model.generate( 499 | **generate_kwargs, 500 | ) 501 | total_tokens_len = len(output_ids[0]) 502 | output = self.tokenizer.decode( 503 | output_ids[0][prompt_tokens_len:], skip_special_tokens=True 504 | ) 505 | completion: Completion = { 506 | "id": completion_id, 507 | "object": "text_completion", 508 | "created": created, 509 | "model": model_name, 510 | "choices": [ 511 | { 512 | "text": output, 513 | "index": 0, 514 | "logprobs": None, 515 | "finish_reason": None, 516 | } 517 | ], 518 | "usage": { 519 | "prompt_tokens": prompt_tokens_len, 520 | "completion_tokens": total_tokens_len - prompt_tokens_len, 521 | "total_tokens": total_tokens_len, 522 | }, 523 | } 524 | return completion 525 | 526 | def chat_completion( 527 | self, 528 | messages: List[Message], 529 | stream: bool = False, 530 | max_new_tokens: int = 1000, 531 | temperature: float = 0.9, 532 | top_p: float = 1.0, 533 | top_k: int = 40, 534 | repetition_penalty: float = 1.0, 535 | **kwargs: Any, 536 | ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: 537 | """For OpenAI compatible API /v1/chat/completions 538 | Generate text from a dialog (chat history). 539 | 540 | Examples: 541 | >>> llama2_wrapper = LLAMA2_WRAPPER() 542 | >>> dialog = [ 543 | { 544 | "role":"system", 545 | "content":"You are a helpful, respectful and honest assistant. 
" 546 | },{ 547 | "role":"user", 548 | "content":"Hi do you know Pytorch?", 549 | }, 550 | ] 551 | >>> print(llm.chat_completion(dialog)) 552 | 553 | Args: 554 | dialog: The dialog (chat history) to generate text from. 555 | stream: Whether to stream the results. 556 | max_new_tokens: The maximum number of tokens to generate. 557 | temperature: The temperature to use for sampling. 558 | top_p: The top-p value to use for sampling. 559 | top_k: The top-k value to use for sampling. 560 | repetition_penalty: The penalty to apply to repeated tokens. 561 | kwargs: all other arguments. 562 | 563 | Raises: 564 | ValueError: If the requested tokens exceed the context window. 565 | RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. 566 | 567 | Returns: 568 | Response object containing the generated text. 569 | """ 570 | completion_id: str = f"cmpl-{str(uuid.uuid4())}" 571 | created: int = int(time.time()) 572 | model_name: str = ( 573 | self.backend_type + " default model" 574 | if self.model_path == "" 575 | else self.model_path 576 | ) 577 | if self.backend_type is BackendType.LLAMA_CPP: 578 | completion_or_chunks = self.model.create_chat_completion( 579 | messages, 580 | stream=stream, 581 | max_tokens=max_new_tokens, 582 | temperature=temperature, 583 | top_p=top_p, 584 | top_k=top_k, 585 | repeat_penalty=repetition_penalty, 586 | **kwargs, 587 | ) 588 | if stream: 589 | chunks: Iterator[ChatCompletionChunk] = completion_or_chunks 590 | return chunks 591 | return completion_or_chunks 592 | else: 593 | prompt = get_prompt_for_dialog(messages) 594 | inputs = self.tokenizer([prompt], return_tensors="pt").input_ids 595 | prompt_tokens_len = len(inputs[0]) 596 | inputs = inputs.to("cuda") 597 | generate_kwargs = dict( 598 | inputs=inputs, 599 | max_new_tokens=max_new_tokens, 600 | temperature=temperature, 601 | top_p=top_p, 602 | top_k=top_k, 603 | repetition_penalty=repetition_penalty, 604 | # num_beams=1, 605 | ) 606 | generate_kwargs = ( 607 | generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs} 608 | ) 609 | if stream: 610 | from transformers import TextIteratorStreamer 611 | 612 | streamer = TextIteratorStreamer( 613 | self.tokenizer, 614 | timeout=10.0, 615 | skip_prompt=True, 616 | skip_special_tokens=True, 617 | ) 618 | generate_kwargs["streamer"] = streamer 619 | t = Thread(target=self.model.generate, kwargs=generate_kwargs) 620 | t.start() 621 | 622 | def chunk_generator(chunks): 623 | yield { 624 | "id": "chat" + completion_id, 625 | "model": model_name, 626 | "created": created, 627 | "object": "chat.completion.chunk", 628 | "choices": [ 629 | { 630 | "index": 0, 631 | "delta": { 632 | "role": "assistant", 633 | }, 634 | "finish_reason": None, 635 | } 636 | ], 637 | } 638 | for part in enumerate(chunks): 639 | yield { 640 | "id": "chat" + completion_id, 641 | "model": model_name, 642 | "created": created, 643 | "object": "chat.completion.chunk", 644 | "choices": [ 645 | { 646 | "index": 0, 647 | "delta": { 648 | "content": part, 649 | }, 650 | "finish_reason": None, 651 | } 652 | ], 653 | } 654 | 655 | chunks: Iterator[ChatCompletionChunk] = chunk_generator(streamer) 656 | return chunks 657 | 658 | else: 659 | output_ids = self.model.generate( 660 | **generate_kwargs, 661 | ) 662 | total_tokens_len = len(output_ids[0]) 663 | output = self.tokenizer.decode( 664 | output_ids[0][prompt_tokens_len:], skip_special_tokens=True 665 | ) 666 | chatcompletion: ChatCompletion = { 667 | "id": "chat" + completion_id, 668 | "object": 
"chat.completion", 669 | "created": created, 670 | "model": model_name, 671 | "choices": [ 672 | { 673 | "index": 0, 674 | "message": { 675 | "role": "assistant", 676 | "content": output, 677 | }, 678 | "finish_reason": None, 679 | } 680 | ], 681 | "usage": { 682 | "prompt_tokens": prompt_tokens_len, 683 | "completion_tokens": total_tokens_len - prompt_tokens_len, 684 | "total_tokens": total_tokens_len, 685 | }, 686 | } 687 | return chatcompletion 688 | 689 | 690 | def get_prompt_for_dialog(dialog: List[Message]) -> str: 691 | """Process dialog (chat history) to llama2 prompt for 692 | OpenAI compatible API /v1/chat/completions. 693 | 694 | Examples: 695 | >>> dialog = [ 696 | { 697 | "role":"system", 698 | "content":"You are a helpful, respectful and honest assistant. " 699 | },{ 700 | "role":"user", 701 | "content":"Hi do you know Pytorch?", 702 | }, 703 | ] 704 | >>> prompt = get_prompt_for_dialog("Hi do you know Pytorch?") 705 | 706 | Args: 707 | dialog: The dialog (chat history) to generate text from. 708 | 709 | Yields: 710 | prompt string. 711 | """ 712 | # add "<>\n{system_prompt}\n<>\n\n" in first dialog 713 | if dialog[0]["role"] == "system": 714 | dialog = [ 715 | { 716 | "role": dialog[1]["role"], 717 | "content": B_SYS + dialog[0]["content"] + E_SYS + dialog[1]["content"], 718 | } 719 | ] + dialog[2:] 720 | # check roles 721 | assert all([msg["role"] == "user" for msg in dialog[::2]]) and all( 722 | [msg["role"] == "assistant" for msg in dialog[1::2]] 723 | ), ( 724 | "model only supports 'system', 'user' and 'assistant' roles, " 725 | "starting with 'system', then 'user' and alternating (u/a/u/a/u...)" 726 | ) 727 | # add chat history 728 | texts = [] 729 | for prompt, answer in zip( 730 | dialog[::2], 731 | dialog[1::2], 732 | ): 733 | texts.append( 734 | f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} " 735 | ) 736 | # check last message if role is user, then add it to prompt text 737 | assert ( 738 | dialog[-1]["role"] == "user" 739 | ), f"Last message must be from user, got {dialog[-1]['role']}" 740 | texts.append(f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}") 741 | return "".join(texts) 742 | 743 | 744 | def get_prompt( 745 | message: str, chat_history: list[tuple[str, str]] = [], system_prompt: str = "" 746 | ) -> str: 747 | """Process message to llama2 prompt with chat history 748 | and system_prompt for chatbot. 749 | 750 | Examples: 751 | >>> prompt = get_prompt("Hi do you know Pytorch?") 752 | 753 | Args: 754 | message: The origianl chat message to generate text from. 755 | chat_history: Chat history list from chatbot. 756 | system_prompt: System prompt for chatbot. 757 | 758 | Yields: 759 | prompt string. 
760 | """ 761 | texts = [f"[INST] <>\n{system_prompt}\n<>\n\n"] 762 | for user_input, response in chat_history: 763 | texts.append(f"{user_input.strip()} [/INST] {response.strip()} [INST] ") 764 | texts.append(f"{message.strip()} [/INST]") 765 | return "".join(texts) 766 | 767 | 768 | class BackendType(Enum): 769 | UNKNOWN = 0 770 | TRANSFORMERS = 1 771 | GPTQ = 2 772 | LLAMA_CPP = 3 773 | 774 | @classmethod 775 | def get_type(cls, backend_name: str): 776 | backend_type = None 777 | backend_name_lower = backend_name.lower() 778 | if "transformers" in backend_name_lower: 779 | backend_type = BackendType.TRANSFORMERS 780 | elif "gptq" in backend_name_lower: 781 | backend_type = BackendType.GPTQ 782 | elif "cpp" in backend_name_lower: 783 | backend_type = BackendType.LLAMA_CPP 784 | else: 785 | raise Exception("Unknown backend: " + backend_name) 786 | # backend_type = BackendType.UNKNOWN 787 | return backend_type 788 | --------------------------------------------------------------------------------