├── .gitignore
├── README.md
├── llama3.1-70b-instruct-awq
│   ├── .bentoignore
│   ├── README.md
│   ├── bentofile.yaml
│   ├── import_model.py
│   ├── openai_endpoints.py
│   ├── requirements.txt
│   └── service.py
├── llama3.1-8b-instruct
│   ├── .bentoignore
│   ├── README.md
│   ├── requirements.txt
│   └── service.py
└── mistral-7b-instruct
    ├── .bentoignore
    ├── README.md
    ├── bentofile.yaml
    ├── requirements.txt
    └── service.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | *.whl
162 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Self-host LLMs with LMDeploy and BentoML
3 |
4 |
5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models (LLMs) using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLMs.
6 |
7 | See [here](https://docs.bentoml.com/en/latest/examples/overview.html) for a full list of BentoML example projects.
8 |
9 | 💡 This example serves as a basis for advanced code customization, such as custom models, inference logic, or LMDeploy options. For simple LLM hosting with OpenAI-compatible endpoints without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM).
10 |
11 | ## Prerequisites
12 |
13 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more.
14 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first.
15 | - If you want to test the Service locally, you need an Nvidia GPU with at least 20G of VRAM.
16 | - This example uses Llama 3.1 8B Instruct. Make sure you have [gained access to the model](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct).
17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details.
18 |
19 | ## Install dependencies
20 |
21 | ```bash
22 | git clone https://github.com/bentoml/BentoLMDeploy.git
23 | cd BentoLMDeploy/llama3.1-8b-instruct
24 | pip install -r requirements.txt
25 | ```
26 |
27 | ## Download the model
28 |
29 | The Service references Llama 3.1 with `bentoml.models.HuggingFaceModel`, so BentoML downloads the model from Hugging Face automatically the first time you run or build the Service. Because the model is gated, make sure your Hugging Face token is available, for example:
30 |
31 | ```bash
32 | export HF_TOKEN=<your_hf_token>
33 | ```
34 |
35 | ## Run the BentoML Service
36 |
37 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service.
38 |
39 | ```bash
40 | $ bentoml serve .
41 |
42 | 2024-05-04T17:24:01+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit)
43 | 2024-05-04 17:24:03,239 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='meta-llama/Meta-Llama-3-8B-Instruct', model_format='hf', tp=1, session_len=None, max_batch_size=128, cach
44 | e_max_entry_count=0.9, cache_block_seq_len=64, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revision=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1)
45 | 2024-05-04 17:24:03,240 - lmdeploy - INFO - input chat_template_config=None
46 | 2024-05-04 17:24:03,339 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='llama3', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=None, eoa=None, separator=None,
47 | capability=None, stop_words=None)
48 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_source: hf_model
49 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_name is deprecated in TurbomindEngineConfig and has no effect
50 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
51 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
52 | 2024-05-04 17:24:03,727 - lmdeploy - WARNING - model_config:
53 |
54 | ...
55 | ```
56 |
57 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other ways.
58 |
59 |
60 |
61 | **CURL**
62 |
63 | ```bash
64 | curl -X 'POST' \
65 | 'http://localhost:3000/generate' \
66 | -H 'accept: text/event-stream' \
67 | -H 'Content-Type: application/json' \
68 | -d '{
69 | "prompt": "Explain superconductors like I'\''m five years old",
70 | "max_tokens": 1024
71 | }'
72 | ```
73 |
74 |
75 |
76 |
77 |
78 | **Python client**
79 |
80 | ```python
81 | import bentoml
82 |
83 | with bentoml.SyncHTTPClient("http://localhost:3000") as client:
84 | response_generator = client.generate(
85 | prompt="Explain superconductors like I'm five years old",
86 | max_tokens=1024
87 | )
88 | for response in response_generator:
89 | print(response, end='')
90 | ```
91 |
92 |
93 |
94 | ## Deploy to BentoCloud
95 |
96 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account.
97 |
98 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it.
99 |
100 | ```bash
101 | bentoml deploy .
102 | ```
103 |
104 | Once the application is up and running on BentoCloud, you can access it via the exposed URL.
105 |
106 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html).
107 |
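108 | A minimal sketch of that workflow with the BentoML CLI, assuming Docker is installed and running locally:
109 |
110 | ```bash
111 | # Build a Bento from the project directory, then package it as an OCI-compliant image.
112 | bentoml build
113 | bentoml containerize <bento_name>:<version>  # placeholder tag; use the one printed by `bentoml build`
114 | ```
115 |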
--------------------------------------------------------------------------------
/llama3.1-70b-instruct-awq/.bentoignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.py[cod]
3 | *$py.class
4 | .ipynb_checkpoints
5 | venv/
6 |
--------------------------------------------------------------------------------
/llama3.1-70b-instruct-awq/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Self-host LLMs with LMDeploy and BentoML
3 |
4 |
5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models (LLMs) using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLMs.
6 |
7 | See [here](https://github.com/bentoml/BentoML?tab=readme-ov-file#%EF%B8%8F-what-you-can-build-with-bentoml) for a full list of BentoML example projects.
8 |
9 | 💡 This example serves as a basis for advanced code customization, such as custom models, inference logic, or LMDeploy options. For simple LLM hosting with OpenAI-compatible endpoints without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM).
10 |
11 |
12 | ## Prerequisites
13 |
14 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more.
15 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first.
16 | - If you want to test the Service locally, you need an Nvidia GPU with enough VRAM for the 70B AWQ model; this example's Service requests an A100 80GB.
17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details.
18 |
19 | ## Install dependencies
20 |
21 | ```bash
22 | git clone https://github.com/bentoml/BentoLMDeploy.git
23 | cd BentoLMDeploy/llama3.1-70b-instruct-awq
24 | pip install -r requirements.txt
25 | ```
26 | ## Download the model
27 |
28 | ```bash
29 | python import_model.py
30 | ```
31 |
32 | ## Run the BentoML Service
33 |
34 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service.
35 |
36 | ```bash
37 | $ bentoml serve .
38 |
39 | 2024-05-04T17:24:01+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit)
40 | 2024-05-04 17:24:03,239 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='meta-llama/Meta-Llama-3-8B-Instruct', model_format='hf', tp=1, session_len=None, max_batch_size=128, cach
41 | e_max_entry_count=0.9, cache_block_seq_len=64, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revision=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1)
42 | 2024-05-04 17:24:03,240 - lmdeploy - INFO - input chat_template_config=None
43 | 2024-05-04 17:24:03,339 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='llama3', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=None, eoa=None, separator=None,
44 | capability=None, stop_words=None)
45 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_source: hf_model
46 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_name is deprecated in TurbomindEngineConfig and has no effect
47 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
48 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
49 | 2024-05-04 17:24:03,727 - lmdeploy - WARNING - model_config:
50 |
51 | ...
52 | ```
53 |
54 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other ways.
55 |
56 |
57 |
58 | **CURL**
59 |
60 | ```bash
61 | curl -X 'POST' \
62 | 'http://localhost:3000/generate' \
63 | -H 'accept: text/event-stream' \
64 | -H 'Content-Type: application/json' \
65 | -d '{
66 | "prompt": "Explain superconductors like I'\''m five years old",
67 | }'
68 | ```
69 |
70 |
71 |
72 |
73 |
74 | **Python client**
75 |
76 | ```python
77 | import bentoml
78 |
79 | with bentoml.SyncHTTPClient("http://localhost:3000") as client:
80 | response_generator = client.generate(
81 | prompt="Explain superconductors like I'm five years old",
82 | )
83 | for response in response_generator:
84 |         print(response, end='')
85 | ```
86 |
87 |
88 |
89 |
90 | ## Deploy to BentoCloud
91 |
92 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account.
93 |
94 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it.
95 |
96 | ```bash
97 | bentoml deploy .
98 | ```
99 |
100 | Once the application is up and running on BentoCloud, you can access it via the exposed URL.
101 |
102 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html).
103 |
104 |
105 | ## Different LLM Models
106 |
107 | Besides the Llama 3.1 70B Instruct AWQ model used here, this repository contains examples for other models in its subdirectories. Below is a list of these models and links to the example subdirectories.
108 |
109 | - [Llama-3.1-8B-Instruct](../llama3.1-8b-instruct/)
110 | - [Mistral-7B-Instruct-v0.2](../mistral-7b-instruct/)
111 |
112 |
113 | ## LLM tools integration examples
114 |
115 | - This directory mounts LMDeploy's OpenAI-compatible endpoints (chat completions, completions, and model listing) on the BentoML Service under `/v1`; see `openai_endpoints.py` and `service.py`.
116 | - The Llama 3.1 8B example registers the same endpoints directly inside its `service.py`.
117 |
118 | A minimal client sketch for these endpoints follows below.
119 |
120 |
121 |
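122 | Below is a minimal sketch of calling those endpoints with the official `openai` Python client. It assumes the Service from the steps above is running locally on port 3000 and that the `openai` package is installed; the `model` value is an assumption and should match whatever `GET /v1/models` reports for your deployment.
123 |
124 | ```python
125 | from openai import OpenAI
126 |
127 | # The BentoML Service mounts LMDeploy's OpenAI-compatible routes under /v1.
128 | client = OpenAI(base_url="http://localhost:3000/v1", api_key="na")  # no real key is needed locally
129 |
130 | # Optional: confirm the model name the server exposes.
131 | print([m.id for m in client.models.list().data])
132 |
133 | # Stream a chat completion.
134 | stream = client.chat.completions.create(
135 |     model="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",  # assumed name; check /v1/models
136 |     messages=[{"role": "user", "content": "Explain superconductors like I'm five years old"}],
137 |     stream=True,
138 | )
139 | for chunk in stream:
140 |     delta = chunk.choices[0].delta
141 |     if delta.content:
142 |         print(delta.content, end="")
143 | ```
144 |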
--------------------------------------------------------------------------------
/llama3.1-70b-instruct-awq/bentofile.yaml:
--------------------------------------------------------------------------------
1 | service: "service:LMDeploy"
2 | labels:
3 | owner: bentoml-team
4 | stage: demo
5 | include:
6 | - "*.py"
7 | python:
8 | requirements_txt: "./requirements.txt"
9 | lock_packages: false
10 |
--------------------------------------------------------------------------------
/llama3.1-70b-instruct-awq/import_model.py:
--------------------------------------------------------------------------------
1 | import bentoml
2 |
3 | MODEL_ID = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
4 | BENTO_MODEL_TAG = MODEL_ID.lower().replace("/", "--")
5 |
6 | def import_model(model_id, bento_model_tag):
7 |
8 | import torch
9 | from transformers import AutoModelForCausalLM, AutoTokenizer
10 |
11 |     tokenizer = AutoTokenizer.from_pretrained(model_id)
12 |     model = AutoModelForCausalLM.from_pretrained(
13 |         model_id,
14 | torch_dtype=torch.float16,
15 | low_cpu_mem_usage=True,
16 | )
17 |
18 | with bentoml.models.create(bento_model_tag) as bento_model_ref:
19 | tokenizer.save_pretrained(bento_model_ref.path)
20 | model.save_pretrained(bento_model_ref.path)
21 |
22 |
23 | if __name__ == "__main__":
24 | import_model(MODEL_ID, BENTO_MODEL_TAG)
25 |
--------------------------------------------------------------------------------
/llama3.1-70b-instruct-awq/openai_endpoints.py:
--------------------------------------------------------------------------------
1 | import fastapi
2 | import lmdeploy.serve.openai.api_server as lmdeploy_api_server
3 |
4 | openai_api_app = fastapi.FastAPI()  # mounted under /v1 by the BentoML Service in service.py
5 |
6 | OPENAI_ENDPOINTS = [  # (route, handler, HTTP methods)
7 | ["/chat/completions", lmdeploy_api_server.chat_completions_v1, ["POST"]],
8 | ["/completions", lmdeploy_api_server.completions_v1, ["POST"]],
9 | ["/models", lmdeploy_api_server.available_models, ["GET"]],
10 | ]
11 |
12 | for route, endpoint, methods in OPENAI_ENDPOINTS:
13 | openai_api_app.add_api_route(
14 | path=route,
15 | endpoint=endpoint,
16 | methods=methods,
17 | include_in_schema=True,
18 | )
19 |
--------------------------------------------------------------------------------
/llama3.1-70b-instruct-awq/requirements.txt:
--------------------------------------------------------------------------------
1 | autoawq==0.2.5
2 | bentoml>=1.3.0
3 | fastapi==0.111.1
4 | lmdeploy==0.5.2.post1
5 |
--------------------------------------------------------------------------------
/llama3.1-70b-instruct-awq/service.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | from typing import AsyncGenerator, Optional
3 |
4 | import bentoml
5 | from annotated_types import Ge, Le
6 | from typing_extensions import Annotated
7 |
8 | from openai_endpoints import openai_api_app
9 |
10 |
11 | MAX_TOKENS = 1024
12 | SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
13 |
14 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
15 |
16 | PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
17 |
18 | {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>
19 |
20 | {user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
21 |
22 | """
23 |
24 | MODEL_ID = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
25 | BENTO_MODEL_TAG = MODEL_ID.lower().replace("/", "--")
26 |
27 |
28 | @bentoml.mount_asgi_app(openai_api_app, path="/v1")
29 | @bentoml.service(
30 | name="bentolmdeploy-llama3.1-70b-instruct-awq-service",
31 | traffic={
32 | "timeout": 300,
33 | },
34 | resources={
35 | "gpu": 1,
36 | "gpu_type": "nvidia-a100-80gb",
37 | },
38 | )
39 | class LMDeploy:
40 |
41 | bento_model_ref = bentoml.models.get(BENTO_MODEL_TAG)
42 |
43 | def __init__(self) -> None:
44 | from transformers import AutoTokenizer
45 | from lmdeploy.serve.async_engine import AsyncEngine
46 | from lmdeploy.messages import TurbomindEngineConfig
47 |
48 | engine_config = TurbomindEngineConfig(
49 | model_name=MODEL_ID,
50 | model_format="awq",
51 | cache_max_entry_count=0.85,
52 | enable_prefix_caching=True,
53 | )
54 | self.engine = AsyncEngine(
55 | self.bento_model_ref.path, backend_config=engine_config
56 | )
57 |
58 |         import lmdeploy.serve.openai.api_server as lmdeploy_api_server
59 |         lmdeploy_api_server.VariableInterface.async_engine = self.engine  # expose the engine to the mounted OpenAI routes
60 |
61 | tokenizer = AutoTokenizer.from_pretrained(self.bento_model_ref.path)
62 | self.stop_tokens = [
63 | tokenizer.convert_ids_to_tokens(
64 | tokenizer.eos_token_id,
65 | ),
66 | "<|eot_id|>",
67 | ]
68 |
69 |
70 | @bentoml.api
71 | async def generate(
72 | self,
73 | ctx: bentoml.Context,
74 | prompt: str = "Explain superconductors in plain English",
75 | system_prompt: Optional[str] = SYSTEM_PROMPT,
76 | max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS,
77 | ) -> AsyncGenerator[str, None]:
78 |
79 | from lmdeploy import GenerationConfig
80 |
81 | gen_config = GenerationConfig(
82 | max_new_tokens=max_tokens, stop_words=self.stop_tokens,
83 | )
84 |
85 | if system_prompt is None:
86 | system_prompt = SYSTEM_PROMPT
87 | prompt = PROMPT_TEMPLATE.format(user_prompt=prompt, system_prompt=system_prompt)
88 |
89 | session_id = abs(uuid.uuid4().int >> 96)
90 | stream = self.engine.generate(
91 | prompt, session_id=session_id, gen_config=gen_config
92 | )
93 |
94 | async for request_output in stream:
95 | if await ctx.request.is_disconnected():
96 | await self.engine.stop_session(session_id)
97 | await self.engine.end_session(session_id)
98 | return
99 | yield request_output.response
100 |
--------------------------------------------------------------------------------
/llama3.1-8b-instruct/.bentoignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.py[cod]
3 | *$py.class
4 | .ipynb_checkpoints
5 | venv/
6 |
--------------------------------------------------------------------------------
/llama3.1-8b-instruct/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Self-host LLMs with LMDeploy and BentoML
3 |
4 |
5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models (LLMs) using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLMs.
6 |
7 | See [here](https://docs.bentoml.com/en/latest/examples/overview.html) for a full list of BentoML example projects.
8 |
9 | 💡 This example serves as a basis for advanced code customization, such as custom models, inference logic, or LMDeploy options. For simple LLM hosting with OpenAI-compatible endpoints without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM).
10 |
11 | ## Prerequisites
12 |
13 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more.
14 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first.
15 | - If you want to test the Service locally, you need an Nvidia GPU with at least 20G of VRAM.
16 | - This example uses Llama 3.1 8B Instruct. Make sure you have [gained access to the model](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct).
17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details.
18 |
19 | ## Install dependencies
20 |
21 | ```bash
22 | git clone https://github.com/bentoml/BentoLMDeploy.git
23 | cd BentoLMDeploy/llama3.1-8b-instruct
24 | pip install -r requirements.txt
25 | ```
26 |
27 | ## Download the model
28 |
29 | The Service references Llama 3.1 with `bentoml.models.HuggingFaceModel`, so BentoML downloads the model from Hugging Face automatically the first time you run or build the Service. Because the model is gated, make sure your Hugging Face token is available, for example:
30 |
31 | ```bash
32 | export HF_TOKEN=<your_hf_token>
33 | ```
34 |
35 | ## Run the BentoML Service
36 |
37 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service.
38 |
39 | ```bash
40 | $ bentoml serve .
41 |
42 | 2024-05-04T17:24:01+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit)
43 | 2024-05-04 17:24:03,239 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='meta-llama/Meta-Llama-3-8B-Instruct', model_format='hf', tp=1, session_len=None, max_batch_size=128, cach
44 | e_max_entry_count=0.9, cache_block_seq_len=64, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revision=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1)
45 | 2024-05-04 17:24:03,240 - lmdeploy - INFO - input chat_template_config=None
46 | 2024-05-04 17:24:03,339 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='llama3', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=None, eoa=None, separator=None,
47 | capability=None, stop_words=None)
48 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_source: hf_model
49 | 2024-05-04 17:24:03,359 - lmdeploy - WARNING - model_name is deprecated in TurbomindEngineConfig and has no effect
50 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
51 | Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
52 | 2024-05-04 17:24:03,727 - lmdeploy - WARNING - model_config:
53 |
54 | ...
55 | ```
56 |
57 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other ways.
58 |
59 |
60 |
61 | **CURL**
62 |
63 | ```bash
64 | curl -X 'POST' \
65 | 'http://localhost:3000/generate' \
66 | -H 'accept: text/event-stream' \
67 | -H 'Content-Type: application/json' \
68 | -d '{
69 | "prompt": "Explain superconductors like I'\''m five years old",
70 | "max_tokens": 1024
71 | }'
72 | ```
73 |
74 |
75 |
76 |
77 |
78 | **Python client**
79 |
80 | ```python
81 | import bentoml
82 |
83 | with bentoml.SyncHTTPClient("http://localhost:3000") as client:
84 | response_generator = client.generate(
85 | prompt="Explain superconductors like I'm five years old",
86 | max_tokens=1024
87 | )
88 | for response in response_generator:
89 | print(response, end='')
90 | ```
91 |
92 |
93 |
94 | ## Deploy to BentoCloud
95 |
96 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account.
97 |
98 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it.
99 |
100 | ```bash
101 | bentoml deploy .
102 | ```
103 |
104 | Once the application is up and running on BentoCloud, you can access it via the exposed URL.
105 |
106 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html).
107 |
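108 | The Service in this directory also registers LMDeploy's OpenAI-compatible routes under `/v1` (see `service.py`). Below is a quick `curl` sketch against the chat completions endpoint, assuming the server from the steps above is running locally; the `model` value is an assumption and should match what `GET /v1/models` returns.
109 |
110 | ```bash
111 | # Optional: list the model names the server exposes.
112 | curl -s http://localhost:3000/v1/models
113 |
114 | # Send a chat completion request (model name assumed; adjust it to the /v1/models output).
115 | curl -s http://localhost:3000/v1/chat/completions \
116 |   -H 'Content-Type: application/json' \
117 |   -d '{
118 |     "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
119 |     "messages": [{"role": "user", "content": "Explain superconductors like I'\''m five years old"}]
120 |   }'
121 | ```
122 |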
--------------------------------------------------------------------------------
/llama3.1-8b-instruct/requirements.txt:
--------------------------------------------------------------------------------
1 | bentoml>=1.4.3
2 | fastapi==0.115.6
3 | lmdeploy==0.7.1
4 |
--------------------------------------------------------------------------------
/llama3.1-8b-instruct/service.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | from typing import AsyncGenerator, Optional
3 |
4 | import bentoml
5 | import fastapi
6 | from annotated_types import Ge, Le
7 | from typing_extensions import Annotated
8 |
9 | openai_api_app = fastapi.FastAPI()
10 |
11 |
12 | MAX_SESSION_LEN = 2048
13 | MAX_TOKENS = 1024
14 | SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
15 |
16 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
17 |
18 | MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
19 |
20 |
21 | @bentoml.asgi_app(openai_api_app, path="/v1")
22 | @bentoml.service(
23 |     name="bentolmdeploy-llama3.1-8b-instruct-service",
24 | image=bentoml.images.PythonImage(python_version="3.11").requirements_file("requirements.txt"),
25 | traffic={
26 | "timeout": 300,
27 | },
28 | resources={
29 | "gpu": 1,
30 | "gpu_type": "nvidia-l4",
31 | },
32 | )
33 | class LMDeploy:
34 | hf_model = bentoml.models.HuggingFaceModel(MODEL_ID)
35 |
36 | def __init__(self) -> None:
37 | from transformers import AutoTokenizer
38 | from lmdeploy import ChatTemplateConfig
39 | from lmdeploy.serve.async_engine import AsyncEngine
40 | from lmdeploy.messages import TurbomindEngineConfig
41 |
42 | engine_config = TurbomindEngineConfig(
43 | model_name=MODEL_ID,
44 | model_format="hf",
45 | cache_max_entry_count=0.9,
46 | enable_prefix_caching=True,
47 | session_len=MAX_SESSION_LEN,
48 | )
49 | self.engine = AsyncEngine(
50 | self.hf_model,
51 | backend_config=engine_config,
52 | model_name=MODEL_ID,
53 | chat_template_config=ChatTemplateConfig("llama3_1"),
54 | )
55 |
56 | self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
57 | self.stop_tokens = [
58 | self.tokenizer.convert_ids_to_tokens(
59 | self.tokenizer.eos_token_id,
60 | ),
61 | "<|eot_id|>",
62 | ]
63 |
64 | import lmdeploy.serve.openai.api_server as lmdeploy_api_server
65 | lmdeploy_api_server.VariableInterface.async_engine = self.engine
66 |
67 | OPENAI_ENDPOINTS = [
68 | ["/chat/completions", lmdeploy_api_server.chat_completions_v1, ["POST"]],
69 | ["/completions", lmdeploy_api_server.completions_v1, ["POST"]],
70 | ["/models", lmdeploy_api_server.available_models, ["GET"]],
71 | ]
72 |
73 | for route, endpoint, methods in OPENAI_ENDPOINTS:
74 | openai_api_app.add_api_route(
75 | path=route,
76 | endpoint=endpoint,
77 | methods=methods,
78 | include_in_schema=True,
79 | )
80 |
81 |
82 | @bentoml.api
83 | async def generate(
84 | self,
85 | ctx: bentoml.Context,
86 | prompt: str = "Explain superconductors in plain English",
87 | system_prompt: Optional[str] = SYSTEM_PROMPT,
88 | max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS,
89 | ) -> AsyncGenerator[str, None]:
90 |
91 | from lmdeploy import GenerationConfig
92 |
93 | gen_config = GenerationConfig(
94 | max_new_tokens=max_tokens, stop_words=self.stop_tokens,
95 | )
96 |
97 | if system_prompt is None:
98 | system_prompt = SYSTEM_PROMPT
99 |
100 | messages = [
101 | dict(role="system", content=system_prompt),
102 | dict(role="user", content=prompt),
103 | ]
104 |
105 | prompt = self.tokenizer.apply_chat_template(
106 | conversation=messages,
107 | add_generation_prompt=True,
108 | tokenize=False,
109 | )
110 |
111 | session_id = abs(uuid.uuid4().int >> 96)
112 | stream = self.engine.generate(
113 | prompt, session_id=session_id, gen_config=gen_config
114 | )
115 |
116 | async for request_output in stream:
117 | if await ctx.request.is_disconnected():
118 | await self.engine.stop_session(session_id)
119 | await self.engine.end_session(session_id)
120 | return
121 | yield request_output.response
122 |
--------------------------------------------------------------------------------
/mistral-7b-instruct/.bentoignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.py[cod]
3 | *$py.class
4 | .ipynb_checkpoints
5 | venv/
6 |
--------------------------------------------------------------------------------
/mistral-7b-instruct/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Self-host LLMs with LMDeploy and BentoML
3 |
4 |
5 | This is a BentoML example project, showing you how to serve and deploy open-source Large Language Models (LLMs) using [LMDeploy](https://github.com/InternLM/lmdeploy), a toolkit for compressing, deploying, and serving LLMs.
6 |
7 | See [here](https://docs.bentoml.com/en/latest/examples/overview.html) for a full list of BentoML example projects.
8 |
9 | 💡 This example serves as a basis for advanced code customization, such as custom models, inference logic, or LMDeploy options. For simple LLM hosting with OpenAI-compatible endpoints without writing any code, see [OpenLLM](https://github.com/bentoml/OpenLLM).
10 |
11 | ## Prerequisites
12 |
13 | - You have installed Python 3.8+ and `pip`. See the [Python downloads page](https://www.python.org/downloads/) to learn more.
14 | - You have a basic understanding of key concepts in BentoML, such as Services. We recommend you read [Quickstart](https://docs.bentoml.com/en/1.2/get-started/quickstart.html) first.
15 | - If you want to test the Service locally, you need an Nvidia GPU with at least 24G of VRAM.
16 | - This example uses Mistral 7B Instruct. Make sure you have [gained access to the model](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2).
17 | - (Optional) We recommend you create a virtual environment for dependency isolation for this project. See the [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) or the [Python documentation](https://docs.python.org/3/library/venv.html) for details.
18 |
19 | ## Install dependencies
20 |
21 | ```bash
22 | git clone https://github.com/bentoml/BentoLMDeploy.git
23 | cd BentoLMDeploy/mistral-7b-instruct
24 | pip install -r requirements.txt
25 | ```
26 |
27 | ## Run the BentoML Service
28 |
29 | We have defined a BentoML Service in `service.py`. Run `bentoml serve` in your project directory to start the Service.
30 |
31 | ```bash
32 | $ bentoml serve .
33 |
34 | 2024-07-05T23:57:36+0800 [INFO] [cli] Starting production HTTP BentoServer from "service:LMDeploy" listening on http://localhost:3000 (Press CTRL+C to quit)
35 | 2024-07-05 23:57:38,582 - lmdeploy - INFO - input backend=turbomind, backend_config=TurbomindEngineConfig(model_name='mistralai/Mistral-7B-Instruct-v0.2', model_format='hf', tp=1, session_len=N
36 | one, max_batch_size=128, cache_max_entry_count=0.95, cache_block_seq_len=64, enable_prefix_caching=False, quant_policy=0, rope_scaling_factor=0.0, use_logn_attn=False, download_dir=None, revisi
37 | on=None, max_prefill_token_num=8192, num_tokens_per_iter=0, max_prefill_iters=1)
38 | 2024-07-05 23:57:38,582 - lmdeploy - INFO - input chat_template_config=None
39 | 2024-07-05 23:57:38,616 - lmdeploy - INFO - updated chat_template_onfig=ChatTemplateConfig(model_name='mistral', system=None, meta_instruction=None, eosys=None, user=None, eoh=None, assistant=N
40 | one, eoa=None, separator=None, capability=None, stop_words=None)
41 | 2024-07-05 23:57:38,652 - lmdeploy - INFO - model_source: hf_model
42 |
43 | ...
44 | ```
45 |
46 | The server is now active at [http://localhost:3000](http://localhost:3000/). You can interact with it using the Swagger UI or in other ways.
47 |
48 |
49 |
50 | **CURL**
51 |
52 | ```bash
53 | curl -X 'POST' \
54 | 'http://localhost:3000/generate' \
55 | -H 'accept: text/event-stream' \
56 | -H 'Content-Type: application/json' \
57 | -d '{
58 | "prompt": "Explain superconductors like I'\''m five years old",
59 | "max_tokens": 1024
60 | }'
61 | ```
62 |
63 |
64 |
65 |
66 |
67 | **Python client**
68 |
69 | ```python
70 | import bentoml
71 |
72 | with bentoml.SyncHTTPClient("http://localhost:3000") as client:
73 | response_generator = client.generate(
74 | prompt="Explain superconductors like I'm five years old",
75 | max_tokens=1024
76 | )
77 | for response in response_generator:
78 | print(response, end='')
79 | ```
80 |
81 |
82 |
83 | ## Deploy to BentoCloud
84 |
85 | After the Service is ready, you can deploy the application to BentoCloud for better management and scalability. [Sign up](https://www.bentoml.com/) if you haven't got a BentoCloud account.
86 |
87 | Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it.
88 |
89 | ```bash
90 | bentoml deploy .
91 | ```
92 |
93 | Once the application is up and running on BentoCloud, you can access it via the exposed URL.
94 |
95 | **Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html).
96 |
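97 | Note that `bentofile.yaml` in this directory declares an `HF_TOKEN` environment variable. Since Mistral-7B-Instruct-v0.2 is a gated model on Hugging Face, make sure a valid access token is available before serving or deploying, for example:
98 |
99 | ```bash
100 | # Placeholder token; replace it with your own Hugging Face access token.
101 | export HF_TOKEN=<your_hugging_face_token>
102 | bentoml serve .
103 | ```
104 |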
--------------------------------------------------------------------------------
/mistral-7b-instruct/bentofile.yaml:
--------------------------------------------------------------------------------
1 | service: "service:LMDeploy"
2 | labels:
3 | owner: bentoml-team
4 | stage: demo
5 | include:
6 | - "*.py"
7 | python:
8 | requirements_txt: "./requirements.txt"
9 | lock_packages: false
10 | envs:
11 | - name: HF_TOKEN
12 |
--------------------------------------------------------------------------------
/mistral-7b-instruct/requirements.txt:
--------------------------------------------------------------------------------
1 | bentoml>=1.3.0
2 | lmdeploy==0.5.1
3 |
--------------------------------------------------------------------------------
/mistral-7b-instruct/service.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | from typing import AsyncGenerator, Optional
3 |
4 | import bentoml
5 | from annotated_types import Ge, Le
6 | from typing_extensions import Annotated
7 |
8 |
9 | MAX_TOKENS = 1024
10 | SYSTEM_PROMPT = """
11 | You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
12 |
13 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
14 | """
15 |
16 | PROMPT_TEMPLATE = """[INST]
17 | {system_prompt}
18 | {user_prompt} [/INST] """
19 |
20 | MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
21 |
22 |
23 | @bentoml.service(
24 |     name="bentolmdeploy-mistral-7b-instruct-service-benchmark",
25 | traffic={
26 | "timeout": 300,
27 | },
28 | resources={
29 | "gpu": 1,
30 | "gpu_type": "nvidia-l4",
31 | },
32 | )
33 | class LMDeploy:
34 |
35 | def __init__(self) -> None:
36 | from transformers import AutoTokenizer
37 | from lmdeploy.serve.async_engine import AsyncEngine
38 | from lmdeploy.messages import TurbomindEngineConfig
39 |
40 | engine_config = TurbomindEngineConfig(
41 | model_name=MODEL_ID,
42 | model_format="hf",
43 | cache_max_entry_count=0.95,
44 | )
45 | self.engine = AsyncEngine(
46 | MODEL_ID, backend_config=engine_config
47 | )
48 |
49 |
50 | @bentoml.api
51 | async def generate(
52 | self,
53 | ctx: bentoml.Context,
54 | prompt: str = "Explain superconductors in plain English",
55 | system_prompt: Optional[str] = SYSTEM_PROMPT,
56 | max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS,
57 | ) -> AsyncGenerator[str, None]:
58 |
59 | from lmdeploy import GenerationConfig
60 |
61 | gen_config = GenerationConfig(max_new_tokens=max_tokens)
62 |
63 | if system_prompt is None:
64 | system_prompt = SYSTEM_PROMPT
65 | prompt = PROMPT_TEMPLATE.format(user_prompt=prompt, system_prompt=system_prompt)
66 |
67 | session_id = abs(uuid.uuid4().int >> 96)
68 | stream = self.engine.generate(
69 | prompt, session_id=session_id, gen_config=gen_config
70 | )
71 |
72 | async for request_output in stream:
73 | if await ctx.request.is_disconnected():
74 | await self.engine.stop_session(session_id)
75 | await self.engine.end_session(session_id)
76 | return
77 | yield request_output.response
78 |
--------------------------------------------------------------------------------