├── .gitignore ├── README.md ├── llama3.1-70b-instruct-awq ├── .bentoignore ├── README.md ├── bentofile.yaml ├── import_model.py ├── openai_endpoints.py ├── requirements.txt └── service.py ├── llama3.1-8b-instruct ├── .bentoignore ├── README.md ├── requirements.txt └── service.py └── mistral-7b-instruct ├── .bentoignore ├── README.md ├── bentofile.yaml ├── requirements.txt └── service.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | *.whl 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
<div align="center"> 2 | <h1 align="center">Self-host LLMs with LMDeploy and BentoML</h1> 3 | </div>
4 | 5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models (LLMs) using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLMs. 6 | 7 | See [here](https://docs.bentoml.com/en/latest/examples/overview.html) for a full list of BentoML example projects. 8 | 9 | 💡 This example is served as a basis for advanced code customization, such as custom model, inference logic or LMDeploy options. For simple LLM hosting with OpenAI compatible endpoint without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM). 10 | 11 | ## Prerequisites 12 | 13 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more. 14 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first. 15 | - If you want to test the Service locally, you need a Nvidia GPU with at least 20G VRAM. 16 | - This example uses Llama 3 8B Instruct. Make sure you have [gained access to the model](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct). 17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details. 18 | 19 | ## Install dependencies 20 | 21 | ```bash 22 | git clone https://github.com/bentoml/BentoLMDeploy.git 23 | cd BentoLMDeploy/llama3.1-8b-instruct 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## Download the model 28 | 29 | Run the script to download Llama 3. 30 | 31 | ```bash 32 | python import_model.py 33 | ``` 34 | 35 | ## Run the BentoML Service 36 | 37 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service. 38 | 39 | ```bash 40 | $ bentoml serve . 41 | 42 | 2024-05-04T17:24:01+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit) 43 | 2024-05-04 17:24:03,239 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='meta-llama/Meta-Llama-3-8B-Instruct', model_format='hf', tp=1, session_len=None, max_batch_size=128, cach 44 | e_max_entry_count=0.9, cache_block_seq_len=64, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revision=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1) 45 | 2024-05-04 17:24:03,240 - lmdeploy - INFO - input chat_template_config=None 46 | 2024-05-04 17:24:03,339 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='llama3', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=None, eoa=None, separator=None, 47 | capability=None, stop_words=None) 48 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_source: hf_model 49 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_name is deprecated in TurbomindEngineConfig and has no effect 50 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 51 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
52 | 2024-05-04 17:24:03,727 - lmdeploy - WARNING - model_config: 53 | 54 | ... 55 | ``` 56 | 57 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other different ways. 58 | 59 |
60 | 61 | CURL 62 | 63 | ```bash 64 | curl -X 'POST' \ 65 | 'http://localhost:3000/generate' \ 66 | -H 'accept: text/event-stream' \ 67 | -H 'Content-Type: application/json' \ 68 | -d '{ 69 | "prompt": "Explain superconductors like I'\''m five years old", 70 | "max_tokens": 1024 71 | }' 72 | ``` 73 | 74 |
75 | 76 |
77 | 78 | Python client 79 | 80 | ```python 81 | import bentoml 82 | 83 | with bentoml.SyncHTTPClient("http://localhost:3000") as client: 84 | response_generator = client.generate( 85 | prompt="Explain superconductors like I'm five years old", 86 | max_tokens=1024 87 | ) 88 | for response in response_generator: 89 | print(response, end='') 90 | ``` 91 | 92 |
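`service.py` in this example also mounts LMDeploy's OpenAI-compatible routes under `/v1` (chat completions, completions, and model listing), so standard OpenAI clients can talk to the same server. The snippet below is a minimal sketch using the `openai` package (not listed in `requirements.txt`; install it separately). The model name is an assumption based on this example's `MODEL_ID`, so confirm it against `GET /v1/models` first.

```python
from openai import OpenAI

# No API key is required for a locally served endpoint; any placeholder works.
client = OpenAI(base_url="http://localhost:3000/v1", api_key="na")

stream = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # assumed; check GET /v1/models
    messages=[
        {"role": "user", "content": "Explain superconductors like I'm five years old"},
    ],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="")
```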
93 | 94 | ## Deploy to BentoCloud 95 | 96 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account. 97 | 98 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it. 99 | 100 | ```bash 101 | bentoml deploy . 102 | ``` 103 | 104 | Once the application is up and running on BentoCloud, you can access it via the exposed URL. 105 | 106 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html). 107 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/.bentoignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | .ipynb_checkpoints 5 | venv/ 6 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/README.md: -------------------------------------------------------------------------------- 1 |
<div align="center"> 2 | <h1 align="center">Self-host LLMs with LMDeploy and BentoML</h1> 3 | </div>
4 | 5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLM. 6 | 7 | See [here](https://github.com/bentoml/BentoML?tab=readme-ov-file#%EF%B8%8F-what-you-can-build-with-bentoml) for a full list of BentoML example projects. 8 | 9 | 💡 This example is served as a basis for advanced code customization, such as custom model, inference logic or LMDeploy options. For simple LLM hosting with OpenAI compatible endpoint without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM). 10 | 11 | 12 | ## Prerequisites 13 | 14 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more. 15 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first. 16 | - If you want to test the Service locally, you need a Nvidia GPU with at least 20G VRAM. 17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details. 18 | 19 | ## Install dependencies 20 | 21 | ```bash 22 | git clone https://github.com/bentoml/BentoLMDeploy.git 23 | pip install -r requirements.txt 24 | ``` 25 | 26 | ## Download the model 27 | 28 | ```bash 29 | python import_model.py 30 | ``` 31 | 32 | ## Run the BentoML Service 33 | 34 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service. 35 | 36 | ```bash 37 | $ bentoml serve . 38 | 39 | 2024-05-04T17:24:01+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit) 40 | 2024-05-04 17:24:03,239 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='meta-llama/Meta-Llama-3-8B-Instruct', model_format='hf', tp=1, session_len=None, max_batch_size=128, cach 41 | e_max_entry_count=0.9, cache_block_seq_len=64, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revision=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1) 42 | 2024-05-04 17:24:03,240 - lmdeploy - INFO - input chat_template_config=None 43 | 2024-05-04 17:24:03,339 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='llama3', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=None, eoa=None, separator=None, 44 | capability=None, stop_words=None) 45 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_source: hf_model 46 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_name is deprecated in TurbomindEngineConfig and has no effect 47 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 48 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 49 | 2024-05-04 17:24:03,727 - lmdeploy - WARNING - model_config: 50 | 51 | ... 52 | ``` 53 | 54 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other different ways. 55 | 56 |
57 | 58 | CURL 59 | 60 | ```bash 61 | curl -X 'POST' \ 62 | 'http://localhost:3000/generate' \ 63 | -H 'accept: text/event-stream' \ 64 | -H 'Content-Type: application/json' \ 65 | -d '{ 66 | "prompt": "Explain superconductors like I'\''m five years old", 67 | }' 68 | ``` 69 | 70 |
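This Service also registers LMDeploy's OpenAI-compatible routes under `/v1` via `openai_endpoints.py`, so you can query it with a standard chat-completions request. A hedged curl sketch follows; the `model` value is assumed from this example's `MODEL_ID`, so list the served models first to confirm the exact name.

```bash
# Discover the model name the server reports
curl 'http://localhost:3000/v1/models'

# Send an OpenAI-style chat completion request
curl -X 'POST' \
  'http://localhost:3000/v1/chat/completions' \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
    "messages": [
      {"role": "user", "content": "Explain superconductors like I'\''m five years old"}
    ]
  }'
```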
71 | 72 |
73 | 74 | Python client 75 | 76 | ```python 77 | import bentoml 78 | 79 | with bentoml.SyncHTTPClient("http://localhost:3000") as client: 80 | response_generator = client.generate( 81 | prompt="Explain superconductors like I'm five years old", 82 | ) 83 | for response in response_generator: 84 | print(response) 85 | ``` 86 | 87 |
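The same `/v1` routes work with the `openai` Python package as well (install it separately; it is not in `requirements.txt`). This sketch reads the served model name from the endpoint instead of hard-coding it, assuming the server reports at least one model:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:3000/v1", api_key="na")  # no key needed locally

model_id = client.models.list().data[0].id  # use whatever name the server reports
response = client.chat.completions.create(
    model=model_id,
    messages=[
        {"role": "user", "content": "Explain superconductors like I'm five years old"},
    ],
)
print(response.choices[0].message.content)
```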
88 | 89 | 90 | ## Deploy to BentoCloud 91 | 92 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account. 93 | 94 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it. 95 | 96 | ```bash 97 | bentoml deploy . 98 | ``` 99 | 100 | Once the application is up and running on BentoCloud, you can access it via the exposed URL. 101 | 102 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html). 103 | 104 | 105 | ## Different LLM Models 106 | 107 | Besides the mistral-7b-instruct model, we have examples for other models in subdirectories of this repository. Below is a list of these models and links to the example subdirectories. 108 | 109 | - [Llama-2-7b-chat-hf](llama2-7b-chat/) 110 | - [Llama-3-8b-instruct](llama3-8b-instruct/) 111 | - [Mistral-7B-Instruct-v0.2](mistral-7b-instruct/) 112 | - [Mixtral-8x7B-Instruct-v0.1 with gptq quantization](mistral-7b-instruct/) 113 | - [Outlines integration](outlines-integration/) 114 | - [SOLAR-10.7B-v1.0](solar-10.7b-instruct/) 115 | 116 | 117 | ## LLM tools integration examples 118 | 119 | - Every model directory contains codes to add OpenAI compatible endpoints to the BentoML service. 120 | - [outlines-integration/](outlines-integration/) contains the code to integrate with [outlines](https://github.com/outlines-dev/outlines) for structured generation. 121 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/bentofile.yaml: -------------------------------------------------------------------------------- 1 | service: "service:LMDeploy" 2 | labels: 3 | owner: bentoml-team 4 | stage: demo 5 | include: 6 | - "*.py" 7 | python: 8 | requirements_txt: "./requirements.txt" 9 | lock_packages: false 10 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/import_model.py: -------------------------------------------------------------------------------- 1 | import bentoml 2 | 3 | MODEL_ID = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4" 4 | BENTO_MODEL_TAG = MODEL_ID.lower().replace("/", "--") 5 | 6 | def import_model(model_id, bento_model_tag): 7 | 8 | import torch 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) 12 | model = AutoModelForCausalLM.from_pretrained( 13 | MODEL_ID, 14 | torch_dtype=torch.float16, 15 | low_cpu_mem_usage=True, 16 | ) 17 | 18 | with bentoml.models.create(bento_model_tag) as bento_model_ref: 19 | tokenizer.save_pretrained(bento_model_ref.path) 20 | model.save_pretrained(bento_model_ref.path) 21 | 22 | 23 | if __name__ == "__main__": 24 | import_model(MODEL_ID, BENTO_MODEL_TAG) 25 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/openai_endpoints.py: -------------------------------------------------------------------------------- 1 | import fastapi 2 | import lmdeploy.serve.openai.api_server as lmdeploy_api_server 3 | 4 | openai_api_app = fastapi.FastAPI() 5 | 6 | OPENAI_ENDPOINTS = [ 7 | ["/chat/completions", lmdeploy_api_server.chat_completions_v1, ["POST"]], 8 | ["/completions", lmdeploy_api_server.completions_v1, ["POST"]], 
9 | ["/models", lmdeploy_api_server.available_models, ["GET"]], 10 | ] 11 | 12 | for route, endpoint, methods in OPENAI_ENDPOINTS: 13 | openai_api_app.add_api_route( 14 | path=route, 15 | endpoint=endpoint, 16 | methods=methods, 17 | include_in_schema=True, 18 | ) 19 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/requirements.txt: -------------------------------------------------------------------------------- 1 | autoawq==0.2.5 2 | bentoml>=1.3.0 3 | fastapi==0.111.1 4 | lmdeploy==0.5.2.post1 5 | -------------------------------------------------------------------------------- /llama3.1-70b-instruct-awq/service.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import AsyncGenerator, Optional 3 | 4 | import bentoml 5 | from annotated_types import Ge, Le 6 | from typing_extensions import Annotated 7 | 8 | from openai_endpoints import openai_api_app 9 | 10 | 11 | MAX_TOKENS = 1024 12 | SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. 13 | 14 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" 15 | 16 | PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> 17 | 18 | {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|> 19 | 20 | {user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|> 21 | 22 | """ 23 | 24 | MODEL_ID = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4" 25 | BENTO_MODEL_TAG = MODEL_ID.lower().replace("/", "--") 26 | 27 | 28 | @bentoml.mount_asgi_app(openai_api_app, path="/v1") 29 | @bentoml.service( 30 | name="bentolmdeploy-llama3.1-70b-instruct-awq-service", 31 | traffic={ 32 | "timeout": 300, 33 | }, 34 | resources={ 35 | "gpu": 1, 36 | "gpu_type": "nvidia-a100-80gb", 37 | }, 38 | ) 39 | class LMDeploy: 40 | 41 | bento_model_ref = bentoml.models.get(BENTO_MODEL_TAG) 42 | 43 | def __init__(self) -> None: 44 | from transformers import AutoTokenizer 45 | from lmdeploy.serve.async_engine import AsyncEngine 46 | from lmdeploy.messages import TurbomindEngineConfig 47 | 48 | engine_config = TurbomindEngineConfig( 49 | model_name=MODEL_ID, 50 | model_format="awq", 51 | cache_max_entry_count=0.85, 52 | enable_prefix_caching=True, 53 | ) 54 | self.engine = AsyncEngine( 55 | self.bento_model_ref.path, backend_config=engine_config 56 | ) 57 | 58 | import lmdeploy.serve.openai.api_server as lmdeploy_api_sever 59 | lmdeploy_api_sever.VariableInterface.async_engine = self.engine 60 | 61 | tokenizer = AutoTokenizer.from_pretrained(self.bento_model_ref.path) 62 | self.stop_tokens = [ 63 | tokenizer.convert_ids_to_tokens( 64 | tokenizer.eos_token_id, 65 | ), 66 | "<|eot_id|>", 67 | ] 68 | 69 | 70 | @bentoml.api 71 | async def generate( 72 | self, 73 | ctx: bentoml.Context, 74 | prompt: str = "Explain superconductors in plain English", 75 | system_prompt: Optional[str] = SYSTEM_PROMPT, 76 | max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS, 77 | ) -> AsyncGenerator[str, None]: 78 | 79 | from lmdeploy import GenerationConfig 80 | 81 | gen_config = GenerationConfig( 82 | 
max_new_tokens=max_tokens, stop_words=self.stop_tokens, 83 | ) 84 | 85 | if system_prompt is None: 86 | system_prompt = SYSTEM_PROMPT 87 | prompt = PROMPT_TEMPLATE.format(user_prompt=prompt, system_prompt=system_prompt) 88 | 89 | session_id = abs(uuid.uuid4().int >> 96) 90 | stream = self.engine.generate( 91 | prompt, session_id=session_id, gen_config=gen_config 92 | ) 93 | 94 | async for request_output in stream: 95 | if await ctx.request.is_disconnected(): 96 | await self.engine.stop_session(session_id) 97 | await self.engine.end_session(session_id) 98 | return 99 | yield request_output.response 100 | -------------------------------------------------------------------------------- /llama3.1-8b-instruct/.bentoignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | .ipynb_checkpoints 5 | venv/ 6 | -------------------------------------------------------------------------------- /llama3.1-8b-instruct/README.md: -------------------------------------------------------------------------------- 1 |
<div align="center"> 2 | <h1 align="center">Self-host LLMs with LMDeploy and BentoML</h1> 3 | </div>
4 | 5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models (LLMs) using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLMs. 6 | 7 | See [here](https://docs.bentoml.com/en/latest/examples/overview.html) for a full list of BentoML example projects. 8 | 9 | 💡 This example is served as a basis for advanced code customization, such as custom model, inference logic or LMDeploy options. For simple LLM hosting with OpenAI compatible endpoint without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM). 10 | 11 | ## Prerequisites 12 | 13 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more. 14 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first. 15 | - If you want to test the Service locally, you need a Nvidia GPU with at least 20G VRAM. 16 | - This example uses Llama 3 8B Instruct. Make sure you have [gained access to the model](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct). 17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details. 18 | 19 | ## Install dependencies 20 | 21 | ```bash 22 | git clone https://github.com/bentoml/BentoLMDeploy.git 23 | cd BentoLMDeploy/llama3-8b-instruct 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## Download the model 28 | 29 | Run the script to download Llama 3. 30 | 31 | ```bash 32 | python import_model.py 33 | ``` 34 | 35 | ## Run the BentoML Service 36 | 37 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service. 38 | 39 | ```bash 40 | $ bentoml serve . 41 | 42 | 2024-05-04T17:24:01+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit) 43 | 2024-05-04 17:24:03,239 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='meta-llama/Meta-Llama-3-8B-Instruct', model_format='hf', tp=1, session_len=None, max_batch_size=128, cach 44 | e_max_entry_count=0.9, cache_block_seq_len=64, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revision=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1) 45 | 2024-05-04 17:24:03,240 - lmdeploy - INFO - input chat_template_config=None 46 | 2024-05-04 17:24:03,339 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='llama3', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=None, eoa=None, separator=None, 47 | capability=None, stop_words=None) 48 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_source: hf_model 49 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_name is deprecated in TurbomindEngineConfig and has no effect 50 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 51 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
52 | 2024-05-04 17:24:03,727 - lmdeploy - WARNING - model_config: 53 | 54 | ... 55 | ``` 56 | 57 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other different ways. 58 | 59 |
60 | 61 | CURL 62 | 63 | ```bash 64 | curl -X 'POST' \ 65 | 'http://localhost:3000/generate' \ 66 | -H 'accept: text/event-stream' \ 67 | -H 'Content-Type: application/json' \ 68 | -d '{ 69 | "prompt": "Explain superconductors like I'\''m five years old", 70 | "max_tokens": 1024 71 | }' 72 | ``` 73 | 74 |
75 | 76 |
77 | 78 | Python client 79 | 80 | ```python 81 | import bentoml 82 | 83 | with bentoml.SyncHTTPClient("http://localhost:3000") as client: 84 | response_generator = client.generate( 85 | prompt="Explain superconductors like I'm five years old", 86 | max_tokens=1024 87 | ) 88 | for response in response_generator: 89 | print(response, end='') 90 | ``` 91 | 92 |
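Because `service.py` mounts LMDeploy's OpenAI-compatible routes at `/v1`, the server also accepts OpenAI-style requests. A hedged curl sketch is below; the `model` value is assumed from this example's `MODEL_ID` (confirm it with `GET /v1/models`), and with `"stream": true` the response arrives as server-sent `data:` chunks.

```bash
curl -X 'POST' \
  'http://localhost:3000/v1/chat/completions' \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "messages": [
      {"role": "user", "content": "Explain superconductors like I'\''m five years old"}
    ],
    "stream": true
  }'
```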
93 | 94 | ## Deploy to BentoCloud 95 | 96 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account. 97 | 98 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it. 99 | 100 | ```bash 101 | bentoml deploy . 102 | ``` 103 | 104 | Once the application is up and running on BentoCloud, you can access it via the exposed URL. 105 | 106 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html). 107 | -------------------------------------------------------------------------------- /llama3.1-8b-instruct/requirements.txt: -------------------------------------------------------------------------------- 1 | bentoml>=1.4.3 2 | fastapi==0.115.6 3 | lmdeploy==0.7.1 4 | -------------------------------------------------------------------------------- /llama3.1-8b-instruct/service.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import AsyncGenerator, Optional 3 | 4 | import bentoml 5 | import fastapi 6 | from annotated_types import Ge, Le 7 | from typing_extensions import Annotated 8 | 9 | openai_api_app = fastapi.FastAPI() 10 | 11 | 12 | MAX_SESSION_LEN = 2048 13 | MAX_TOKENS = 1024 14 | SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. 15 | 16 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""" 17 | 18 | MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct" 19 | 20 | 21 | @bentoml.asgi_app(openai_api_app, path="/v1") 22 | @bentoml.service( 23 | name="bentolmdeploy-llama3.1-8b-insruct-service", 24 | image=bentoml.images.PythonImage(python_version="3.11").requirements_file("requirements.txt"), 25 | traffic={ 26 | "timeout": 300, 27 | }, 28 | resources={ 29 | "gpu": 1, 30 | "gpu_type": "nvidia-l4", 31 | }, 32 | ) 33 | class LMDeploy: 34 | hf_model = bentoml.models.HuggingFaceModel(MODEL_ID) 35 | 36 | def __init__(self) -> None: 37 | from transformers import AutoTokenizer 38 | from lmdeploy import ChatTemplateConfig 39 | from lmdeploy.serve.async_engine import AsyncEngine 40 | from lmdeploy.messages import TurbomindEngineConfig 41 | 42 | engine_config = TurbomindEngineConfig( 43 | model_name=MODEL_ID, 44 | model_format="hf", 45 | cache_max_entry_count=0.9, 46 | enable_prefix_caching=True, 47 | session_len=MAX_SESSION_LEN, 48 | ) 49 | self.engine = AsyncEngine( 50 | self.hf_model, 51 | backend_config=engine_config, 52 | model_name=MODEL_ID, 53 | chat_template_config=ChatTemplateConfig("llama3_1"), 54 | ) 55 | 56 | self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model) 57 | self.stop_tokens = [ 58 | self.tokenizer.convert_ids_to_tokens( 59 | self.tokenizer.eos_token_id, 60 | ), 61 | "<|eot_id|>", 62 | ] 63 | 64 | import lmdeploy.serve.openai.api_server as lmdeploy_api_server 65 | lmdeploy_api_server.VariableInterface.async_engine = self.engine 66 | 67 | OPENAI_ENDPOINTS = [ 68 | ["/chat/completions", lmdeploy_api_server.chat_completions_v1, ["POST"]], 69 | ["/completions", lmdeploy_api_server.completions_v1, ["POST"]], 70 | ["/models", lmdeploy_api_server.available_models, ["GET"]], 71 | ] 72 | 73 | for route, endpoint, methods in OPENAI_ENDPOINTS: 74 | openai_api_app.add_api_route( 75 | path=route, 76 | endpoint=endpoint, 77 | methods=methods, 78 | include_in_schema=True, 79 | ) 80 | 81 | 82 | @bentoml.api 83 | async def generate( 84 | self, 85 | ctx: bentoml.Context, 86 | prompt: str = "Explain superconductors in plain English", 87 | system_prompt: Optional[str] = SYSTEM_PROMPT, 88 | max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS, 89 | ) -> AsyncGenerator[str, None]: 90 | 91 | from lmdeploy import GenerationConfig 92 | 93 | gen_config = GenerationConfig( 94 | max_new_tokens=max_tokens, stop_words=self.stop_tokens, 95 | ) 96 | 97 | if system_prompt is None: 98 | system_prompt = SYSTEM_PROMPT 99 | 100 | messages = [ 101 | dict(role="system", content=system_prompt), 102 | dict(role="user", content=prompt), 103 | ] 104 | 105 | prompt = self.tokenizer.apply_chat_template( 106 | conversation=messages, 107 | add_generation_prompt=True, 108 | tokenize=False, 109 | ) 110 | 111 | session_id = abs(uuid.uuid4().int >> 96) 112 | stream = self.engine.generate( 113 | prompt, session_id=session_id, gen_config=gen_config 114 | ) 115 | 116 | async for request_output in stream: 117 | if await ctx.request.is_disconnected(): 118 | await self.engine.stop_session(session_id) 119 | await self.engine.end_session(session_id) 120 | return 121 | yield request_output.response 122 | -------------------------------------------------------------------------------- /mistral-7b-instruct/.bentoignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | .ipynb_checkpoints 5 | venv/ 6 | 
-------------------------------------------------------------------------------- /mistral-7b-instruct/README.md: -------------------------------------------------------------------------------- 1 |
<div align="center"> 2 | <h1 align="center">Self-host LLMs with LMDeploy and BentoML</h1> 3 | </div>
4 | 5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models (LLMs) using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLMs. 6 | 7 | See [here](https://docs.bentoml.com/en/latest/examples/overview.html) for a full list of BentoML example projects. 8 | 9 | 💡 This example is served as a basis for advanced code customization, such as custom model, inference logic or LMDeploy options. For simple LLM hosting with OpenAI compatible endpoint without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM). 10 | 11 | ## Prerequisites 12 | 13 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more. 14 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first. 15 | - If you want to test the Service locally, you need a Nvidia GPU with at least 24G VRAM. 16 | - This example uses Mistral 7B Instruct. Make sure you have [gained access to the model](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2). 17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details. 18 | 19 | ## Install dependencies 20 | 21 | ```bash 22 | git clone https://github.com/bentoml/BentoLMDeploy.git 23 | cd BentoLMDeploy/mistral-7b-instruct 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | ## Run the BentoML Service 28 | 29 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service. 30 | 31 | ```bash 32 | $ bentoml serve . 33 | 34 | 2024-07-05T23:57:36+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit) 35 | 2024-07-05 23:57:38,582 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='mistralai/Mistral-7B-Instruct-v0.2', model_format='hf', tp=1, session_len=N 36 | one, max_batch_size=128, cache_max_entry_count=0.95, cache_block_seq_len=64, enable_prefix_caching=False, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revisi 37 | on=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1) 38 | 2024-07-05 23:57:38,582 - lmdeploy - INFO - input chat_template_config=None 39 | 2024-07-05 23:57:38,616 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='mistral', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=N 40 | one, eoa=None, separator=None, capability=None, stop_words=None) 41 | 2024-07-05 23:57:38,652 - lmdeploy - INFO - model_source: hf_model 42 | 43 | ... 44 | ``` 45 | 46 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other different ways. 47 | 48 |
49 | 50 | CURL 51 | 52 | ```bash 53 | curl -X 'POST' \ 54 | 'http://localhost:3000/generate' \ 55 | -H 'accept: text/event-stream' \ 56 | -H 'Content-Type: application/json' \ 57 | -d '{ 58 | "prompt": "Explain superconductors like I'\''m five years old", 59 | "max_tokens": 1024 60 | }' 61 | ``` 62 | 63 |
64 | 65 |
66 | 67 | Python client 68 | 69 | ```python 70 | import bentoml 71 | 72 | with bentoml.SyncHTTPClient("http://localhost:3000") as client: 73 | response_generator = client.generate( 74 | prompt="Explain superconductors like I'm five years old", 75 | max_tokens=1024 76 | ) 77 | for response in response_generator: 78 | print(response, end='') 79 | ``` 80 | 81 |
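If you prefer not to depend on the BentoML client, `/generate` streams plain text chunks over HTTP, so any streaming HTTP client works. Below is a minimal sketch with the `requests` library (install it separately); depending on the BentoML version, the stream may be framed as server-sent events, in which case each chunk is prefixed with `data:`.

```python
import requests

payload = {
    "prompt": "Explain superconductors like I'm five years old",
    "max_tokens": 1024,
}

# stream=True keeps the connection open and yields chunks as they are generated
with requests.post("http://localhost:3000/generate", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="")
```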
82 | 83 | ## Deploy to BentoCloud 84 | 85 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account. 86 | 87 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it. 88 | 89 | ```bash 90 | bentoml deploy . 91 | ``` 92 | 93 | Once the application is up and running on BentoCloud, you can access it via the exposed URL. 94 | 95 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html). 96 | -------------------------------------------------------------------------------- /mistral-7b-instruct/bentofile.yaml: -------------------------------------------------------------------------------- 1 | service: "service:LMDeploy" 2 | labels: 3 | owner: bentoml-team 4 | stage: demo 5 | include: 6 | - "*.py" 7 | python: 8 | requirements_txt: "./requirements.txt" 9 | lock_packages: false 10 | envs: 11 | - name: HF_TOKEN 12 | -------------------------------------------------------------------------------- /mistral-7b-instruct/requirements.txt: -------------------------------------------------------------------------------- 1 | bentoml>=1.3.0 2 | lmdeploy==0.5.1 3 | -------------------------------------------------------------------------------- /mistral-7b-instruct/service.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from typing import AsyncGenerator, Optional 3 | 4 | import bentoml 5 | from annotated_types import Ge, Le 6 | from typing_extensions import Annotated 7 | 8 | 9 | MAX_TOKENS = 1024 10 | SYSTEM_PROMPT = """ 11 | You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. 12 | 13 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. 
14 | """ 15 | 16 | PROMPT_TEMPLATE = """[INST] 17 | {system_prompt} 18 | {user_prompt} [/INST] """ 19 | 20 | MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2" 21 | 22 | 23 | @bentoml.service( 24 | name="bentolmdeploy-mistral-7b-insruct-service-benchmark", 25 | traffic={ 26 | "timeout": 300, 27 | }, 28 | resources={ 29 | "gpu": 1, 30 | "gpu_type": "nvidia-l4", 31 | }, 32 | ) 33 | class LMDeploy: 34 | 35 | def __init__(self) -> None: 36 | from transformers import AutoTokenizer 37 | from lmdeploy.serve.async_engine import AsyncEngine 38 | from lmdeploy.messages import TurbomindEngineConfig 39 | 40 | engine_config = TurbomindEngineConfig( 41 | model_name=MODEL_ID, 42 | model_format="hf", 43 | cache_max_entry_count=0.95, 44 | ) 45 | self.engine = AsyncEngine( 46 | MODEL_ID, backend_config=engine_config 47 | ) 48 | 49 | 50 | @bentoml.api 51 | async def generate( 52 | self, 53 | ctx: bentoml.Context, 54 | prompt: str = "Explain superconductors in plain English", 55 | system_prompt: Optional[str] = SYSTEM_PROMPT, 56 | max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS, 57 | ) -> AsyncGenerator[str, None]: 58 | 59 | from lmdeploy import GenerationConfig 60 | 61 | gen_config = GenerationConfig(max_new_tokens=max_tokens) 62 | 63 | if system_prompt is None: 64 | system_prompt = SYSTEM_PROMPT 65 | prompt = PROMPT_TEMPLATE.format(user_prompt=prompt, system_prompt=system_prompt) 66 | 67 | session_id = abs(uuid.uuid4().int >> 96) 68 | stream = self.engine.generate( 69 | prompt, session_id=session_id, gen_config=gen_config 70 | ) 71 | 72 | async for request_output in stream: 73 | if await ctx.request.is_disconnected(): 74 | await self.engine.stop_session(session_id) 75 | await self.engine.end_session(session_id) 76 | return 77 | yield request_output.response 78 | --------------------------------------------------------------------------------