├── .gitignore ├── README.md ├── llama3.1-70b-instruct-awq ├── .bentoignore ├── README.md ├── bentofile.yaml ├── import_model.py ├── openai_endpoints.py ├── requirements.txt └── service.py ├── llama3.1-8b-instruct ├── .bentoignore ├── README.md ├── requirements.txt └── service.py └── mistral-7b-instruct ├── .bentoignore ├── README.md ├── bentofile.yaml ├── requirements.txt └── service.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | *.whl 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
<div align="center"> 2 | <h1 align="center">Self-host LLMs with LMDeploy and BentoML</h1> 3 | </div>
4 | 5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models (LLMs) using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLMs. 6 | 7 | See [here](https://docs.bentoml.com/en/latest/examples/overview.html) for a full list of BentoML example projects. 8 | 9 | 💡 This example is served as a basis for advanced code customization, such as custom model, inference logic or LMDeploy options. For simple LLM hosting with OpenAI compatible endpoint without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM). 10 | 11 | ## Prerequisites 12 | 13 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more. 14 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first. 15 | - If you want to test the Service locally, you need a Nvidia GPU with at least 20G VRAM. 16 | - This example uses Llama 3 8B Instruct. Make sure you have [gained access to the model](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct). 17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details. 18 | 19 | ## Install dependencies 20 | 21 | ```bash 22 | git clone https://github.com/bentoml/BentoLMDeploy.git 23 | cd BentoLMDeploy/llama3.1-8b-instruct 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## Download the model 28 | 29 | Run the script to download Llama 3. 30 | 31 | ```bash 32 | python import_model.py 33 | ``` 34 | 35 | ## Run the BentoML Service 36 | 37 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service. 38 | 39 | ```bash 40 | $ bentoml serve . 41 | 42 | 2024-05-04T17:24:01+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit) 43 | 2024-05-04 17:24:03,239 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='meta-llama/Meta-Llama-3-8B-Instruct', model_format='hf', tp=1, session_len=None, max_batch_size=128, cach 44 | e_max_entry_count=0.9, cache_block_seq_len=64, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revision=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1) 45 | 2024-05-04 17:24:03,240 - lmdeploy - INFO - input chat_template_config=None 46 | 2024-05-04 17:24:03,339 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='llama3', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=None, eoa=None, separator=None, 47 | capability=None, stop_words=None) 48 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_source: hf_model 49 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_name is deprecated in TurbomindEngineConfig and has no effect 50 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 51 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
52 | 2024-05-04 17:24:03,727 - lmdeploy - WARNING - model_config: 53 | 54 | ... 55 | ``` 56 | 57 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other different ways. 58 | 59 |
60 | 61 | CURL 62 | 63 | ```bash 64 | curl -X 'POST' \ 65 | 'http://localhost:3000/generate' \ 66 | -H 'accept: text/event-stream' \ 67 | -H 'Content-Type: application/json' \ 68 | -d '{ 69 | "prompt": "Explain superconductors like I'\''m five years old", 70 | "max_tokens": 1024 71 | }' 72 | ``` 73 | 74 |
75 | 76 |
77 | 78 | Python client 79 | 80 | ```python 81 | import bentoml 82 | 83 | with bentoml.SyncHTTPClient("http://localhost:3000") as client: 84 | response_generator = client.generate( 85 | prompt="Explain superconductors like I'm five years old", 86 | max_tokens=1024 87 | ) 88 | for response in response_generator: 89 | print(response, end='') 90 | ``` 91 | 92 |
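`service.py` in this example also mounts LMDeploy's OpenAI-compatible routes under `/v1` (chat completions, completions, and model listing), so standard OpenAI clients can talk to the same server. The snippet below is a minimal sketch using the `openai` package (not listed in `requirements.txt`; install it separately). The model name is an assumption based on this example's `MODEL_ID`, so confirm it against `GET /v1/models` first.

```python
from openai import OpenAI

# No API key is required for a locally served endpoint; any placeholder works.
client = OpenAI(base_url="http://localhost:3000/v1", api_key="na")

stream = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # assumed; check GET /v1/models
    messages=[
        {"role": "user", "content": "Explain superconductors like I'm five years old"},
    ],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="")
```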
93 | 94 | ## Deploy to BentoCloud 95 | 96 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account. 97 | 98 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it. 99 | 100 | ```bash 101 | bentoml deploy . 102 | ``` 103 | 104 | Once the application is up and running on BentoCloud, you can access it via the exposed URL. 105 | 106 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html). 107 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/.bentoignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | .ipynb_checkpoints 5 | venv/ 6 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/README.md: -------------------------------------------------------------------------------- 1 |
<div align="center"> 2 | <h1 align="center">Self-host LLMs with LMDeploy and BentoML</h1> 3 | </div>
4 | 5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLM. 6 | 7 | See [here](https://github.com/bentoml/BentoML?tab=readme-ov-file#%EF%B8%8F-what-you-can-build-with-bentoml) for a full list of BentoML example projects. 8 | 9 | 💡 This example is served as a basis for advanced code customization, such as custom model, inference logic or LMDeploy options. For simple LLM hosting with OpenAI compatible endpoint without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM). 10 | 11 | 12 | ## Prerequisites 13 | 14 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more. 15 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first. 16 | - If you want to test the Service locally, you need a Nvidia GPU with at least 20G VRAM. 17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details. 18 | 19 | ## Install dependencies 20 | 21 | ```bash 22 | git clone https://github.com/bentoml/BentoLMDeploy.git 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | ## Download the model 27 | 28 | ```bash 29 | python import_model.py 30 | ``` 31 | 32 | ## Run the BentoML Service 33 | 34 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service. 35 | 36 | ```bash 37 | $ bentoml serve . 38 | 39 | 2024-05-04T17:24:01+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit) 40 | 2024-05-04 17:24:03,239 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='meta-llama/Meta-Llama-3-8B-Instruct', model_format='hf', tp=1, session_len=None, max_batch_size=128, cach 41 | e_max_entry_count=0.9, cache_block_seq_len=64, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revision=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1) 42 | 2024-05-04 17:24:03,240 - lmdeploy - INFO - input chat_template_config=None 43 | 2024-05-04 17:24:03,339 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='llama3', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=None, eoa=None, separator=None, 44 | capability=None, stop_words=None) 45 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_source: hf_model 46 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_name is deprecated in TurbomindEngineConfig and has no effect 47 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 48 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 49 | 2024-05-04 17:24:03,727 - lmdeploy - WARNING - model_config: 50 | 51 | ... 52 | ``` 53 | 54 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other different ways. 55 | 56 |
57 | 58 | CURL 59 | 60 | ```bash 61 | curl -X 'POST' \ 62 | 'http://localhost:3000/generate' \ 63 | -H 'accept: text/event-stream' \ 64 | -H 'Content-Type: application/json' \ 65 | -d '{ 66 | "prompt": "Explain superconductors like I'\''m five years old", 67 | }' 68 | ``` 69 | 70 |
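This Service also registers LMDeploy's OpenAI-compatible routes under `/v1` via `openai_endpoints.py`, so you can query it with a standard chat-completions request. A hedged curl sketch follows; the `model` value is assumed from this example's `MODEL_ID`, so list the served models first to confirm the exact name.

```bash
# Discover the model name the server reports
curl 'http://localhost:3000/v1/models'

# Send an OpenAI-style chat completion request
curl -X 'POST' \
  'http://localhost:3000/v1/chat/completions' \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
    "messages": [
      {"role": "user", "content": "Explain superconductors like I'\''m five years old"}
    ]
  }'
```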
71 | 72 |
73 | 74 | Python client 75 | 76 | ```python 77 | import bentoml 78 | 79 | with bentoml.SyncHTTPClient("http://localhost:3000") as client: 80 | response_generator = client.generate( 81 | prompt="Explain superconductors like I'm five years old", 82 | ) 83 | for response in response_generator: 84 | print(response) 85 | ``` 86 | 87 |
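The same `/v1` routes work with the `openai` Python package as well (install it separately; it is not in `requirements.txt`). This sketch reads the served model name from the endpoint instead of hard-coding it, assuming the server reports at least one model:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:3000/v1", api_key="na")  # no key needed locally

model_id = client.models.list().data[0].id  # use whatever name the server reports
response = client.chat.completions.create(
    model=model_id,
    messages=[
        {"role": "user", "content": "Explain superconductors like I'm five years old"},
    ],
)
print(response.choices[0].message.content)
```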
88 | 89 | 90 | ## Deploy to BentoCloud 91 | 92 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account. 93 | 94 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it. 95 | 96 | ```bash 97 | bentoml deploy . 98 | ``` 99 | 100 | Once the application is up and running on BentoCloud, you can access it via the exposed URL. 101 | 102 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html). 103 | 104 | 105 | ## Different LLM Models 106 | 107 | Besides the mistral-7b-instruct model, we have examples for other models in subdirectories of this repository. Below is a list of these models and links to the example subdirectories. 108 | 109 | - [Llama-2-7b-chat-hf](llama2-7b-chat/) 110 | - [Llama-3-8b-instruct](llama3-8b-instruct/) 111 | - [Mistral-7B-Instruct-v0.2](mistral-7b-instruct/) 112 | - [Mixtral-8x7B-Instruct-v0.1 with gptq quantization](mistral-7b-instruct/) 113 | - [Outlines integration](outlines-integration/) 114 | - [SOLAR-10.7B-v1.0](solar-10.7b-instruct/) 115 | 116 | 117 | ## LLM tools integration examples 118 | 119 | - Every model directory contains codes to add OpenAI compatible endpoints to the BentoML service. 120 | - [outlines-integration/](outlines-integration/) contains the code to integrate with [outlines](https://github.com/outlines-dev/outlines) for structured generation. 121 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/bentofile.yaml: -------------------------------------------------------------------------------- 1 | service: "service:LMDeploy" 2 | labels: 3 | owner: bentoml-team 4 | stage: demo 5 | include: 6 | - "*.py" 7 | python: 8 | requirements_txt: "./requirements.txt" 9 | lock_packages: false 10 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/import_model.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | 3 | MODEL_ID = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4" 4 | BENTO_MODEL_TAG = MODEL_ID.lower().replace("/", "--") 5 | 6 | def import_model(model_id, bento_model_tag): 7 | 8 | import torch 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 12 | model = AutoModelForCausalLM.from_pretrained( 13 | MODEL_ID, 14 | torch_dtype=torch.float16, 15 | low_cpu_mem_usage=True, 16 | ) 17 | 18 | with bentoml.models.create(bento_model_tag) as bento_model_ref: 19 | tokenizer.save_pretrained(bento_model_ref.path) 20 | model.save_pretrained(bento_model_ref.path) 21 | 22 | 23 | if __name__ == "__main__": 24 | import_model(MODEL_ID, BENTO_MODEL_TAG) 25 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/openai_endpoints.py: -------------------------------------------------------------------------------- 1 | import fastapi 2 | import lmdeploy.serve.openai.api_server as lmdeploy_api_server 3 | 4 | openai_api_app = fastapi.FastAPI() 5 | 6 | OPENAI_ENDPOINTS = [ 7 | ["/chat/completions", lmdeploy_api_server.chat_completions_v1, ["POST"]], 8 | ["/completions", lmdeploy_api_server.completions_v1, ["POST"]], 
9 | ["/models", lmdeploy_api_server.available_models, ["GET"]], 10 | ] 11 | 12 | for route, endpoint, methods in OPENAI_ENDPOINTS: 13 | openai_api_app.add_api_route( 14 | path=route, 15 | endpoint=endpoint, 16 | methods=methods, 17 | include_in_schema=True, 18 | ) 19 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/requirements.txt: -------------------------------------------------------------------------------- 1 | autoawq==0.2.5 2 | bentoml>=1.3.0 3 | fastapi==0.111.1 4 | lmdeploy==0.5.2.post1 5 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/service.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import AsyncGenerator, Optional 3 | 4 | import bentoml 5 | from annotated_types import Ge, Le 6 | from typing_extensions import Annotated 7 | 8 | from openai_endpoints import openai_api_app 9 | 10 | 11 | MAX_TOKENS = 1024 12 | SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. 13 | 14 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" 15 | 16 | PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> 17 | 18 | {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|> 19 | 20 | {user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|> 21 | 22 | """ 23 | 24 | MODEL_ID = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4" 25 | BENTO_MODEL_TAG = MODEL_ID.lower().replace("/", "--") 26 | 27 | 28 | @bentoml.mount_asgi_app(openai_api_app, path="/v1") 29 | @bentoml.service( 30 | name="bentolmdeploy-llama3.1-70b-instruct-awq-service", 31 | traffic={ 32 | "timeout": 300, 33 | }, 34 | resources={ 35 | "gpu": 1, 36 | "gpu_type": "nvidia-a100-80gb", 37 | }, 38 | ) 39 | class LMDeploy: 40 | 41 | bento_model_ref = bentoml.models.get(BENTO_MODEL_TAG) 42 | 43 | def __init__(self) -> None: 44 | from transformers import AutoTokenizer 45 | from lmdeploy.serve.async_engine import AsyncEngine 46 | from lmdeploy.messages import TurbomindEngineConfig 47 | 48 | engine_config = TurbomindEngineConfig( 49 | model_name=MODEL_ID, 50 | model_format="awq", 51 | cache_max_entry_count=0.85, 52 | enable_prefix_caching=True, 53 | ) 54 | self.engine = AsyncEngine( 55 | self.bento_model_ref.path, backend_config=engine_config 56 | ) 57 | 58 | import lmdeploy.serve.openai.api_server as lmdeploy_api_sever 59 | lmdeploy_api_sever.VariableInterface.async_engine = self.engine 60 | 61 | tokenizer = AutoTokenizer.from_pretrained(self.bento_model_ref.path) 62 | self.stop_tokens = [ 63 | tokenizer.convert_ids_to_tokens( 64 | tokenizer.eos_token_id, 65 | ), 66 | "<|eot_id|>", 67 | ] 68 | 69 | 70 | @bentoml.api 71 | async def generate( 72 | self, 73 | ctx: bentoml.Context, 74 | prompt: str = "Explain superconductors in plain English", 75 | system_prompt: Optional[str] = SYSTEM_PROMPT, 76 | max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS, 77 | ) -> AsyncGenerator[str, None]: 78 | 79 | from lmdeploy import GenerationConfig 80 | 81 | gen_config = GenerationConfig( 82 | 
max_new_tokens=max_tokens, stop_words=self.stop_tokens, 83 | ) 84 | 85 | if system_prompt is None: 86 | system_prompt = SYSTEM_PROMPT 87 | prompt = PROMPT_TEMPLATE.format(user_prompt=prompt, system_prompt=system_prompt) 88 | 89 | session_id = abs(uuid.uuid4().int >> 96) 90 | stream = self.engine.generate( 91 | prompt, session_id=session_id, gen_config=gen_config 92 | ) 93 | 94 | async for request_output in stream: 95 | if await ctx.request.is_disconnected(): 96 | await self.engine.stop_session(session_id) 97 | await self.engine.end_session(session_id) 98 | return 99 | yield request_output.response 100 | -------------------------------------------------------------------------------- /llama3.1-8b-instruct/.bentoignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | .ipynb_checkpoints 5 | venv/ 6 | -------------------------------------------------------------------------------- /llama3.1-8b-instruct/README.md: -------------------------------------------------------------------------------- 1 |
<div align="center"> 2 | <h1 align="center">Self-host LLMs with LMDeploy and BentoML</h1> 3 | </div>
4 | 5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models (LLMs) using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLMs. 6 | 7 | See [here](https://docs.bentoml.com/en/latest/examples/overview.html) for a full list of BentoML example projects. 8 | 9 | 💡 This example is served as a basis for advanced code customization, such as custom model, inference logic or LMDeploy options. For simple LLM hosting with OpenAI compatible endpoint without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM). 10 | 11 | ## Prerequisites 12 | 13 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more. 14 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first. 15 | - If you want to test the Service locally, you need a Nvidia GPU with at least 20G VRAM. 16 | - This example uses Llama 3 8B Instruct. Make sure you have [gained access to the model](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct). 17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details. 18 | 19 | ## Install dependencies 20 | 21 | ```bash 22 | git clone https://github.com/bentoml/BentoLMDeploy.git 23 | cd BentoLMDeploy/llama3-8b-instruct 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## Download the model 28 | 29 | Run the script to download Llama 3. 30 | 31 | ```bash 32 | python import_model.py 33 | ``` 34 | 35 | ## Run the BentoML Service 36 | 37 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service. 38 | 39 | ```bash 40 | $ bentoml serve . 41 | 42 | 2024-05-04T17:24:01+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit) 43 | 2024-05-04 17:24:03,239 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='meta-llama/Meta-Llama-3-8B-Instruct', model_format='hf', tp=1, session_len=None, max_batch_size=128, cach 44 | e_max_entry_count=0.9, cache_block_seq_len=64, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revision=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1) 45 | 2024-05-04 17:24:03,240 - lmdeploy - INFO - input chat_template_config=None 46 | 2024-05-04 17:24:03,339 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='llama3', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=None, eoa=None, separator=None, 47 | capability=None, stop_words=None) 48 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_source: hf_model 49 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_name is deprecated in TurbomindEngineConfig and has no effect 50 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 51 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
52 | 2024-05-04 17:24:03,727 - lmdeploy - WARNING - model_config: 53 | 54 | ... 55 | ``` 56 | 57 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other different ways. 58 | 59 |
60 | 61 | CURL 62 | 63 | ```bash 64 | curl -X 'POST' \ 65 | 'http://localhost:3000/generate' \ 66 | -H 'accept: text/event-stream' \ 67 | -H 'Content-Type: application/json' \ 68 | -d '{ 69 | "prompt": "Explain superconductors like I'\''m five years old", 70 | "max_tokens": 1024 71 | }' 72 | ``` 73 | 74 |
75 | 76 |
77 | 78 | Python client 79 | 80 | ```python 81 | import bentoml 82 | 83 | with bentoml.SyncHTTPClient("http://localhost:3000") as client: 84 | response_generator = client.generate( 85 | prompt="Explain superconductors like I'm five years old", 86 | max_tokens=1024 87 | ) 88 | for response in response_generator: 89 | print(response, end='') 90 | ``` 91 | 92 |
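Because `service.py` mounts LMDeploy's OpenAI-compatible routes at `/v1`, the server also accepts OpenAI-style requests. A hedged curl sketch is below; the `model` value is assumed from this example's `MODEL_ID` (confirm it with `GET /v1/models`), and with `"stream": true` the response arrives as server-sent `data:` chunks.

```bash
curl -X 'POST' \
  'http://localhost:3000/v1/chat/completions' \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "messages": [
      {"role": "user", "content": "Explain superconductors like I'\''m five years old"}
    ],
    "stream": true
  }'
```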
93 | 94 | ## Deploy to BentoCloud 95 | 96 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account. 97 | 98 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it. 99 | 100 | ```bash 101 | bentoml deploy . 102 | ``` 103 | 104 | Once the application is up and running on BentoCloud, you can access it via the exposed URL. 105 | 106 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html). 107 | -------------------------------------------------------------------------------- /llama3.1-8b-instruct/requirements.txt: -------------------------------------------------------------------------------- 1 | bentoml>=1.4.3 2 | fastapi==0.115.6 3 | lmdeploy==0.7.1 4 | -------------------------------------------------------------------------------- /llama3.1-8b-instruct/service.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import AsyncGenerator, Optional 3 | 4 | import bentoml 5 | import fastapi 6 | from annotated_types import Ge, Le 7 | from typing_extensions import Annotated 8 | 9 | openai_api_app = fastapi.FastAPI() 10 | 11 | 12 | MAX_SESSION_LEN = 2048 13 | MAX_TOKENS = 1024 14 | SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. 15 | 16 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""" 17 | 18 | MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct" 19 | 20 | 21 | @bentoml.asgi_app(openai_api_app, path="/v1") 22 | @bentoml.service( 23 | name="bentolmdeploy-llama3.1-8b-insruct-service", 24 | image=bentoml.images.PythonImage(python_version="3.11").requirements_file("requirements.txt"), 25 | traffic={ 26 | "timeout": 300, 27 | }, 28 | resources={ 29 | "gpu": 1, 30 | "gpu_type": "nvidia-l4", 31 | }, 32 | ) 33 | class LMDeploy: 34 | hf_model = bentoml.models.HuggingFaceModel(MODEL_ID) 35 | 36 | def __init__(self) -> None: 37 | from transformers import AutoTokenizer 38 | from lmdeploy import ChatTemplateConfig 39 | from lmdeploy.serve.async_engine import AsyncEngine 40 | from lmdeploy.messages import TurbomindEngineConfig 41 | 42 | engine_config = TurbomindEngineConfig( 43 | model_name=MODEL_ID, 44 | model_format="hf", 45 | cache_max_entry_count=0.9, 46 | enable_prefix_caching=True, 47 | session_len=MAX_SESSION_LEN, 48 | ) 49 | self.engine = AsyncEngine( 50 | self.hf_model, 51 | backend_config=engine_config, 52 | model_name=MODEL_ID, 53 | chat_template_config=ChatTemplateConfig("llama3_1"), 54 | ) 55 | 56 | self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model) 57 | self.stop_tokens = [ 58 | self.tokenizer.convert_ids_to_tokens( 59 | self.tokenizer.eos_token_id, 60 | ), 61 | "<|eot_id|>", 62 | ] 63 | 64 | import lmdeploy.serve.openai.api_server as lmdeploy_api_server 65 | lmdeploy_api_server.VariableInterface.async_engine = self.engine 66 | 67 | OPENAI_ENDPOINTS = [ 68 | ["/chat/completions", lmdeploy_api_server.chat_completions_v1, ["POST"]], 69 | ["/completions", lmdeploy_api_server.completions_v1, ["POST"]], 70 | ["/models", lmdeploy_api_server.available_models, ["GET"]], 71 | ] 72 | 73 | for route, endpoint, methods in OPENAI_ENDPOINTS: 74 | openai_api_app.add_api_route( 75 | path=route, 76 | endpoint=endpoint, 77 | methods=methods, 78 | include_in_schema=True, 79 | ) 80 | 81 | 82 | @bentoml.api 83 | async def generate( 84 | self, 85 | ctx: bentoml.Context, 86 | prompt: str = "Explain superconductors in plain English", 87 | system_prompt: Optional[str] = SYSTEM_PROMPT, 88 | max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS, 89 | ) -> AsyncGenerator[str, None]: 90 | 91 | from lmdeploy import GenerationConfig 92 | 93 | gen_config = GenerationConfig( 94 | max_new_tokens=max_tokens, stop_words=self.stop_tokens, 95 | ) 96 | 97 | if system_prompt is None: 98 | system_prompt = SYSTEM_PROMPT 99 | 100 | messages = [ 101 | dict(role="system", content=system_prompt), 102 | dict(role="user", content=prompt), 103 | ] 104 | 105 | prompt = self.tokenizer.apply_chat_template( 106 | conversation=messages, 107 | add_generation_prompt=True, 108 | tokenize=False, 109 | ) 110 | 111 | session_id = abs(uuid.uuid4().int >> 96) 112 | stream = self.engine.generate( 113 | prompt, session_id=session_id, gen_config=gen_config 114 | ) 115 | 116 | async for request_output in stream: 117 | if await ctx.request.is_disconnected(): 118 | await self.engine.stop_session(session_id) 119 | await self.engine.end_session(session_id) 120 | return 121 | yield request_output.response 122 | -------------------------------------------------------------------------------- /mistral-7b-instruct/.bentoignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | .ipynb_checkpoints 5 | venv/ 6 | 
-------------------------------------------------------------------------------- /mistral-7b-instruct/README.md: -------------------------------------------------------------------------------- 1 |
<div align="center"> 2 | <h1 align="center">Self-host LLMs with LMDeploy and BentoML</h1> 3 | </div>
4 | 5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models (LLMs) using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLMs. 6 | 7 | See [here](https://docs.bentoml.com/en/latest/examples/overview.html) for a full list of BentoML example projects. 8 | 9 | 💡 This example is served as a basis for advanced code customization, such as custom model, inference logic or LMDeploy options. For simple LLM hosting with OpenAI compatible endpoint without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM). 10 | 11 | ## Prerequisites 12 | 13 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more. 14 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first. 15 | - If you want to test the Service locally, you need a Nvidia GPU with at least 24G VRAM. 16 | - This example uses Mistral 7B Instruct. Make sure you have [gained access to the model](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2). 17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details. 18 | 19 | ## Install dependencies 20 | 21 | ```bash 22 | git clone https://github.com/bentoml/BentoLMDeploy.git 23 | cd BentoLMDeploy/mistral-7b-instruct 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## Run the BentoML Service 28 | 29 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service. 30 | 31 | ```bash 32 | $ bentoml serve . 33 | 34 | 2024-07-05T23:57:36+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit) 35 | 2024-07-05 23:57:38,582 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='mistralai/Mistral-7B-Instruct-v0.2', model_format='hf', tp=1, session_len=N 36 | one, max_batch_size=128, cache_max_entry_count=0.95, cache_block_seq_len=64, enable_prefix_caching=False, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revisi 37 | on=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1) 38 | 2024-07-05 23:57:38,582 - lmdeploy - INFO - input chat_template_config=None 39 | 2024-07-05 23:57:38,616 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='mistral', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=N 40 | one, eoa=None, separator=None, capability=None, stop_words=None) 41 | 2024-07-05 23:57:38,652 - lmdeploy - INFO - model_source: hf_model 42 | 43 | ... 44 | ``` 45 | 46 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other different ways. 47 | 48 |
49 | 50 | CURL 51 | 52 | ```bash 53 | curl -X 'POST' \ 54 | 'http://localhost:3000/generate' \ 55 | -H 'accept: text/event-stream' \ 56 | -H 'Content-Type: application/json' \ 57 | -d '{ 58 | "prompt": "Explain superconductors like I'\''m five years old", 59 | "max_tokens": 1024 60 | }' 61 | ``` 62 | 63 |
64 | 65 |
66 | 67 | Python client 68 | 69 | ```python 70 | import bentoml 71 | 72 | with bentoml.SyncHTTPClient("http://localhost:3000") as client: 73 | response_generator = client.generate( 74 | prompt="Explain superconductors like I'm five years old", 75 | max_tokens=1024 76 | ) 77 | for response in response_generator: 78 | print(response, end='') 79 | ``` 80 | 81 |
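If you prefer not to depend on the BentoML client, `/generate` streams plain text chunks over HTTP, so any streaming HTTP client works. Below is a minimal sketch with the `requests` library (install it separately); depending on the BentoML version, the stream may be framed as server-sent events, in which case each chunk is prefixed with `data:`.

```python
import requests

payload = {
    "prompt": "Explain superconductors like I'm five years old",
    "max_tokens": 1024,
}

# stream=True keeps the connection open and yields chunks as they are generated
with requests.post("http://localhost:3000/generate", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="")
```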
82 | 83 | ## Deploy to BentoCloud 84 | 85 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account. 86 | 87 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it. 88 | 89 | ```bash 90 | bentoml deploy . 91 | ``` 92 | 93 | Once the application is up and running on BentoCloud, you can access it via the exposed URL. 94 | 95 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html). 96 | -------------------------------------------------------------------------------- /mistral-7b-instruct/bentofile.yaml: -------------------------------------------------------------------------------- 1 | service: "service:LMDeploy" 2 | labels: 3 | owner: bentoml-team 4 | stage: demo 5 | include: 6 | - "*.py" 7 | python: 8 | requirements_txt: "./requirements.txt" 9 | lock_packages: false 10 | envs: 11 | - name: HF_TOKEN 12 | -------------------------------------------------------------------------------- /mistral-7b-instruct/requirements.txt: -------------------------------------------------------------------------------- 1 | bentoml>=1.3.0 2 | lmdeploy==0.5.1 3 | -------------------------------------------------------------------------------- /mistral-7b-instruct/service.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import AsyncGenerator, Optional 3 | 4 | import bentoml 5 | from annotated_types import Ge, Le 6 | from typing_extensions import Annotated 7 | 8 | 9 | MAX_TOKENS = 1024 10 | SYSTEM_PROMPT = """ 11 | You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. 12 | 13 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. 
14 | """ 15 | 16 | PROMPT_TEMPLATE = """[INST] 17 | {system_prompt} 18 | {user_prompt} [/INST] """ 19 | 20 | MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2" 21 | 22 | 23 | @bentoml.service( 24 | name="bentolmdeploy-mistral-7b-insruct-service-benchmark", 25 | traffic={ 26 | "timeout": 300, 27 | }, 28 | resources={ 29 | "gpu": 1, 30 | "gpu_type": "nvidia-l4", 31 | }, 32 | ) 33 | class LMDeploy: 34 | 35 | def __init__(self) -> None: 36 | from transformers import AutoTokenizer 37 | from lmdeploy.serve.async_engine import AsyncEngine 38 | from lmdeploy.messages import TurbomindEngineConfig 39 | 40 | engine_config = TurbomindEngineConfig( 41 | model_name=MODEL_ID, 42 | model_format="hf", 43 | cache_max_entry_count=0.95, 44 | ) 45 | self.engine = AsyncEngine( 46 | MODEL_ID, backend_config=engine_config 47 | ) 48 | 49 | 50 | @bentoml.api 51 | async def generate( 52 | self, 53 | ctx: bentoml.Context, 54 | prompt: str = "Explain superconductors in plain English", 55 | system_prompt: Optional[str] = SYSTEM_PROMPT, 56 | max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS, 57 | ) -> AsyncGenerator[str, None]: 58 | 59 | from lmdeploy import GenerationConfig 60 | 61 | gen_config = GenerationConfig(max_new_tokens=max_tokens) 62 | 63 | if system_prompt is None: 64 | system_prompt = SYSTEM_PROMPT 65 | prompt = PROMPT_TEMPLATE.format(user_prompt=prompt, system_prompt=system_prompt) 66 | 67 | session_id = abs(uuid.uuid4().int >> 96) 68 | stream = self.engine.generate( 69 | prompt, session_id=session_id, gen_config=gen_config 70 | ) 71 | 72 | async for request_output in stream: 73 | if await ctx.request.is_disconnected(): 74 | await self.engine.stop_session(session_id) 75 | await self.engine.end_session(session_id) 76 | return 77 | yield request_output.response 78 | --------------------------------------------------------------------------------