├── .gitignore ├── LICENSE ├── README.md ├── api.py ├── chat_templates ├── alpaca.jinja ├── chatml.jinja └── place_your_templates_here.txt ├── pyvenv.cfg ├── screenshots ├── webui_screenshot.png ├── webui_screenshot1.png ├── webui_screenshot2.png ├── webui_screenshot3.png └── webui_screenshot4.png ├── serve_chat_completions.py ├── serve_completions.py ├── setup.py ├── sliced_llama.py ├── sliced_llama_exl2.py ├── sliced_llama_server.py ├── start.bat └── webui ├── LICENSE-sse.js ├── favicon.ico ├── index.css ├── index.html ├── index.js ├── sse.js └── webui-dragndrop-test.html /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 silphendio 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sliced_llama 2 | SlicedLlama is AI text generation software powered by [exllamav2](https://github.com/turboderp/exllamav2). 3 | It runs large language models (LLMs) and offers an interface to generate text, adjust model parameters, and modify the model on the fly by rearranging layers. 4 | A webUI is included, but the server is also compatible with other LLM GUIs. 5 | 6 |  7 | 8 | 9 | ## Features 10 | - Text Completion WebUI 11 | - partly OpenAI-compatible API (this is a work in progress) 12 | - Layer Slicing: Basically instant Franken-self-merges. You don't even need to reload the model (just the cache). 13 | - Top Logprobs: See the top probabilities for each chosen token. This might help with adjusting sampler parameters. 14 | 15 | 16 | ## Installation 17 | - Make sure Python is installed, along with a CUDA- or ROCm-compatible GPU driver (ROCm is Linux only).
18 | - Clone or download this repository. 19 | - Use the setup script. This creates a venv and installs dependencies. 20 | ```bash 21 | git clone --depth=1 https://github.com/silphendio/sliced_llama 22 | cd sliced_llama 23 | python ./setup.py 24 | ``` 25 | 26 | DISCLAIMER: I haven't tested it on Windows at all. 27 | 28 | ## Usage 29 | On Linux, just run it with 30 | ```bash 31 | ./sliced_llama_server.py 32 | ``` 33 | On Windows, click `start.bat` instead. (It invokes `.venv\Scripts\python sliced_llama_server.py`) 34 | 35 | This starts the inference server and the webUI. There, you can load models, adjust parameters, and run inference. 36 | You can also use command line arguments, e.g.: 37 | ``` 38 | ./sliced_llama_server.py --model ~/path/to/llm-model-exl2/ --context-size 2048 --slices "0-24, 8-32" 39 | ``` 40 | Currently only exl2 models are supported. You can get them from [huggingface](https://huggingface.co/models?sort=trending&search=exl2). 41 | Make sure that the model fits into VRAM, with some extra memory depending on `(context size)² * (number of layers)`, where context size is the number of tokens the model can remember. 42 | 43 | The WebUI currently only supports text completion, so you need to do the prompt formatting yourself. Each model has its preferred prompt format, so look it up. 44 | 45 | ## Compatibility with other apps: 46 | As an alternative to the webUI, the server also works with OpenAI-compatible GUIs like [Mikupad](https://github.com/lmg-anon/mikupad) or [SillyTavern](https://github.com/SillyTavern/SillyTavern). 47 | 48 | 49 | - For SillyTavern, select chat completion, and use `http://127.0.0.1:57593/v1` as the custom endpoint. 50 | This will not give you many options, but if you change parameters in the WebUI, the inference server should remember them. 51 | You can select different chat templates in the WebUI. You can add more to the `chat_templates` folder.
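You can also call the API from your own scripts. Below is a minimal Python sketch (it assumes the `requests` package is installed and a model is already loaded; the model name is a placeholder, since the server uses whatever model is currently loaded). Because chat completion currently only works with streaming, it reads the response as server-sent events:

```python
import json
import requests  # assumed dependency: pip install requests

# stream a chat completion from the local server (default address 127.0.0.1:57593)
with requests.post(
    "http://127.0.0.1:57593/v1/chat/completions",
    json={
        "model": "whatever-is-loaded",  # required by the schema, but not used to pick a model
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 64,
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if line.startswith(b"data: "):  # skip SSE comments / keep-alive pings
            chunk = json.loads(line[len(b"data: "):])
            print(chunk["choices"][0]["delta"]["content"] or "", end="", flush=True)
```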
52 | 53 | ## TODO / missing features 54 | In no particular order: 55 | - configuration file 56 | - LoRA support 57 | - Classifier Free Guidance 58 | - OpenAI API: 59 | - chat completion currently only works with streaming 60 | - `presency_penalty` and `frequency_penalty` aren't supported 61 | - authentication 62 | - usage statistics 63 | - compatibility with TabbyAPI (For better SillyTavern integration) 64 | - merging different models together 65 | - different merging methods 66 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | import time 2 | from pydantic import BaseModel, Field 3 | 4 | 5 | class GenerationInput(BaseModel): 6 | prompt: str 7 | 8 | 9 | # OpenAI API 10 | 11 | 12 | 13 | # OpenAI Chat API 14 | 15 | class ChatMessage(BaseModel): 16 | content: str | None = None 17 | # tool_calls # not implemented 18 | role : str = "assistant" 19 | name : str | None = None 20 | 21 | 22 | class ChatRequest(BaseModel): 23 | messages: list[ChatMessage] = [] 24 | model: str 25 | frequency_penalty: float = 0.0 26 | logit_bias: dict[str, float] = {} 27 | logprobs: bool = False 28 | top_logprobs: int = 0 29 | max_tokens: int | None = None 30 | n: int = 1 31 | presence_penalty: float | None = None 32 | #response_format 33 | seed: int | None = None 34 | stop: list[str] | str | None = None 35 | stream: bool = False 36 | temperature: float = 1.0 37 | top_p: float = 1.0 38 | #tools 39 | #tool_choice 40 | #user 41 | 42 | ## OAI chat completions 43 | class TopLogprob(BaseModel): 44 | token: str 45 | logprob: float 46 | bytes: list[int] | None = None 47 | 48 | class ChatCompletionLogProb(BaseModel): 49 | token: str 50 | logprob: float 51 | bytes: list[int] | None = None 52 | top_logprobs: list[TopLogprob] 53 | 54 | class ChatCompletionLogProbs(BaseModel): 55 | content: list[ChatCompletionLogProb] = [] 56 | 57 | class ChatCompletionChoice(BaseModel): 58 | finish_reason: str | None = Field(default=None, description='"stop", "Length", or null') 59 | index : int = Field(default=0, description="n > 1 (multiple choices).") 60 | message: ChatMessage 61 | logprobs: ChatCompletionLogProbs | None = None 62 | 63 | class CompletionUsage(BaseModel): 64 | completion_tokens: int 65 | prompt_tokens: int 66 | total_tokens: int 67 | 68 | class ChatCompletion(BaseModel): 69 | id: str = "1234" 70 | choices: list[ChatCompletionChoice] 71 | created: int = 1234 72 | model: str 73 | system_fingerprint: str = "N.A." 74 | object: str = "chat.completion" 75 | usage: CompletionUsage | None # TODO 76 | 77 | class ChatCompletionChunkChoice(BaseModel): 78 | finish_reason: str | None = Field(default=None, description='"stop", "Length", or null') 79 | index : int = Field(default=0, description="n > 1 (multiple choices).") 80 | delta: ChatMessage 81 | logprobs: ChatCompletionLogProbs | None = None 82 | 83 | 84 | class ChatCompletionChunk(BaseModel): 85 | id: str = "1234" 86 | choices: list[ChatCompletionChunkChoice] 87 | created: int = 1234 88 | model: str 89 | system_fingerprint: str = "N.A." 
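    # object type reported to clients (the upstream OpenAI API marks streamed chunks as "chat.completion.chunk")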
90 | object: str = "chat.completion" 91 | 92 | 93 | ## OAI completions 94 | class CompletionRequest(BaseModel): 95 | model: str | None = None 96 | prompt: str 97 | stream: bool = Field(default = False) 98 | best_of: int | None = None 99 | echo: bool = False 100 | logit_bias: dict[int,int] | None = None 101 | logprobs: int | None = None 102 | max_tokens: int | None = None 103 | n: int = Field(default=1, description="This will be ignored. Batch processing is not yet implemented.") 104 | frequency_penalty: float | None = None 105 | presence_penalty: float | None = None 106 | seed: int | None = None 107 | stop: str | list[str] | None = None 108 | stream: bool = False 109 | suffix: str = "" 110 | temperature: float | None = None 111 | top_p: float | None = None 112 | user: str | None = None 113 | 114 | 115 | 116 | class CompletionLogProbs(BaseModel): 117 | text_offset: list[int] = [] 118 | token_logprobs: list[float] = [] 119 | tokens: list[str] = [] 120 | top_logprobs: list[dict[str, float]] | None = [] 121 | 122 | class CompletionChoice(BaseModel): 123 | finish_reason: str | None = 'length' # or 'stop 124 | index: int = 0 125 | logprobs: CompletionLogProbs | None = None 126 | text: str = "" 127 | 128 | class Completion(BaseModel): 129 | id: str = "1234" 130 | choices: list[CompletionChoice] 131 | created: int = 1234 132 | model: str 133 | system_fingerprint: str = "this fingerprint does not exist" 134 | object: str = "chat.completion" 135 | usage: CompletionUsage | None = None # TODO: add this & make it mandatory! 136 | 137 | # embeddings 138 | 139 | class EmbeddingsRequest(BaseModel): 140 | input: str | list[str] 141 | model: str | None = None 142 | encoding_format: str = 'float' 143 | user: str = "user1234" 144 | 145 | class Embeddings(BaseModel): 146 | index: int = 0 147 | embedding: list 148 | object: str = 'embedding' 149 | 150 | class PromptRequest: 151 | prompt: str 152 | class ValueResponse(BaseModel): 153 | value: int 154 | 155 | # tabbyAPI stuff 156 | class TabbyTokenEncodeRequest(BaseModel): 157 | add_bos_token: bool = True 158 | encode_special_tokens: bool = True 159 | decode_special_tokens: bool = True 160 | text: str 161 | 162 | class TabbyTokenDecodeRequest(BaseModel): 163 | add_bos_token: bool = True 164 | encode_special_tokens: bool = True 165 | decode_special_tokens: bool = True 166 | tokens: list[int] 167 | 168 | class ModelCardParameters(BaseModel): 169 | """Represents model card parameters.""" 170 | 171 | # Safe to do this since it's guaranteed to fetch a max seq len 172 | # from model_container 173 | max_seq_len: int | None = None 174 | rope_scale: float = 1.0 175 | rope_alpha: float = 1.0 176 | cache_mode: str = "FP16" 177 | prompt_template: str | None = None 178 | num_experts_per_token: int | None = None 179 | #use_cfg: Optional[bool] = None 180 | #draft: Optional["ModelCard"] = None 181 | 182 | 183 | 184 | class ModelInfo(BaseModel): 185 | """Represents a single model card.""" 186 | 187 | id: str = "not available" 188 | object: str = "model" 189 | created: int = 1234 # no idea what time this represents 190 | owned_by: str = "no idea" 191 | #logging: Optional[LogPreferences] = None 192 | parameters: ModelCardParameters | None = None 193 | available_layers: int | None = None 194 | used_layers: list[int] | None = None 195 | 196 | 197 | 198 | 199 | class LoadModelRequest(BaseModel): 200 | name: str | None = Field(default=None, description = "This is actually the path to the folder containing the safetensor and config files. 
The awkward name is for compatibility with TabbyAPI.") 201 | cache_mode: str = Field(default="FP8", description="use FP8 to save memory, or FP16 for better accuracy") 202 | max_seq_len: int | None = None 203 | layer_list: list[int] | None = None 204 | 205 | 206 | 207 | class LayerRearrangeRequest(BaseModel): 208 | layer_list: list[int] 209 | 210 | # other stuff 211 | class ModelLoadResponse(BaseModel): 212 | success: bool = True 213 | 214 | class LayerRearrangeResponse(BaseModel): 215 | success: bool = True 216 | 217 | class TemplatesResponse(BaseModel): 218 | object: str = "list" 219 | data: list[str] = [] 220 | 221 | class LoadTemplateRequest(BaseModel): 222 | name: str -------------------------------------------------------------------------------- /chat_templates/alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} 30 | -------------------------------------------------------------------------------- /chat_templates/chatml.jinja: -------------------------------------------------------------------------------- 1 | {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %} 2 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %} 3 | -------------------------------------------------------------------------------- /chat_templates/place_your_templates_here.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/chat_templates/place_your_templates_here.txt -------------------------------------------------------------------------------- /pyvenv.cfg: -------------------------------------------------------------------------------- 1 | home = /usr/bin 2 | include-system-site-packages = false 3 | version = 3.11.6 4 | executable = /usr/bin/python3.11 5 | command = /usr/bin/python3 -m venv /home/silphendio/Documents/code/ai/sliced_llama 6 | -------------------------------------------------------------------------------- /screenshots/webui_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/screenshots/webui_screenshot.png -------------------------------------------------------------------------------- /screenshots/webui_screenshot1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/screenshots/webui_screenshot1.png -------------------------------------------------------------------------------- /screenshots/webui_screenshot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/screenshots/webui_screenshot2.png -------------------------------------------------------------------------------- /screenshots/webui_screenshot3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/screenshots/webui_screenshot3.png -------------------------------------------------------------------------------- /screenshots/webui_screenshot4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/screenshots/webui_screenshot4.png -------------------------------------------------------------------------------- /serve_chat_completions.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | from fastapi.encoders import jsonable_encoder 4 | from sse_starlette import EventSourceResponse 5 | from api import * 6 | from sliced_llama import SlicedLLama 7 | from fastapi import APIRouter, HTTPException 8 | import jinja2 9 | import os 10 | 11 | template_folders = [] 12 | 13 | llm : SlicedLLama 14 | 15 | jinja2_env = jinja2.Environment() 16 | default_template_string = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" 17 | chatml_template = jinja2_env.from_string(default_template_string) 18 | chatml_start_str = '<|im_start|>assistant\n' 19 | chatml_stop_str = '<|im_end|>' 20 | 21 | router = APIRouter() 22 | 23 | @router.post("/v1/chat/completions", response_model=ChatCompletion) 24 | async def oai_chat_completions(req: ChatRequest) -> ChatCompletion | EventSourceResponse: 25 | print("request received: ", req.model_dump()) 26 | print("----------------------------------") 27 | 28 | prompt = chatml_template.render(req.model_dump()) 29 | prompt += chatml_start_str 30 | print(prompt) 31 | print("#######################") 32 | print("prompt in ChatML format:\n" + prompt + "\n") 33 | 34 | # update gen settings (TODO: do this properly?)
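    # (every field of the request is merged into the shared sampler settings object,
    # so values like temperature and top_p persist for subsequent requests as well)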
35 | llm.gen_settings.__dict__ = dict(llm.gen_settings.__dict__, **req.model_dump()) 36 | 37 | # handle parameters 38 | stop_str = req.stop 39 | if stop_str is str: stop_str = [stop_str] 40 | if stop_str is None: stop_str = [] 41 | stop_str += [chatml_stop_str] 42 | 43 | 44 | if req.stream: 45 | async def event_generator(): 46 | ret_logprobs = ChatCompletionLogProbs() if req.logprobs and req.logprobs > 0 else None 47 | text = '' 48 | finish_reason = None 49 | if not req.max_tokens: req.max_tokens = 99999999 50 | llm.start_stream(prompt, stop_strings=stop_str, logprobs=req.logprobs) 51 | try: 52 | for i in range(req.max_tokens): 53 | ret = llm.stream_next_token() 54 | text += ret.chunk 55 | print(ret.chunk, end='') 56 | if ret.eos: 57 | finish_reason = "stop" 58 | break 59 | 60 | # log probs 61 | if ret_logprobs: 62 | ret_logprobs.content = ChatCompletionLogProb( 63 | text_offset = [0], 64 | logprob = ret.logprob, 65 | token = ret.chunk, 66 | top_logprobs = [ TopLogprob(token=tok, logprob=lp) for tok,lp in ret.top_logprobs.items()], 67 | ) 68 | 69 | stream_chunk_data = ChatCompletionChunk( 70 | model = llm.model_name, 71 | choices = [ChatCompletionChunkChoice( 72 | finish_reason=finish_reason, 73 | delta=ChatMessage(content=ret.chunk), 74 | logprobs=ret_logprobs 75 | )] 76 | ) 77 | yield json.dumps(jsonable_encoder(stream_chunk_data)) # remove class names, fix braces and quotes 78 | await asyncio.sleep(0) 79 | except asyncio.CancelledError: 80 | print(" - STREAM CANCELLED") 81 | 82 | return EventSourceResponse(event_generator()) 83 | 84 | else: # not streaming 85 | raise NotImplementedError # TODO 86 | 87 | # chat templates 88 | 89 | # file utils: these work with folders too 90 | def find_file(name: str, search_path: list[str] = [], ext: str = "") -> str | None: 91 | if not name.endswith(ext): 92 | name += ext 93 | if os.path.exists(name): 94 | return name 95 | for folder in search_path: 96 | if not os.path.isdir(folder): continue 97 | for filename in os.listdir(folder): 98 | if name == filename: 99 | return os.path.join(folder, filename) 100 | 101 | def list_files(search_path: list[str], ext: str = ""): 102 | results = [] 103 | for folder in search_path: 104 | if not os.path.isdir(folder): continue 105 | for filename in os.listdir(folder): 106 | if filename.endswith(ext): 107 | results.append(filename[:-len(ext)]) 108 | return results 109 | 110 | 111 | @router.get("/v1/templates") 112 | @router.get("/v1/template/list") 113 | async def get_templates() -> TemplatesResponse: 114 | print("template_folders", template_folders) 115 | return TemplatesResponse(data= list_files(template_folders, ".jinja")) 116 | 117 | @router.post("/v1/template/switch") 118 | async def switch_template(req: LoadTemplateRequest): 119 | global chatml_template 120 | print(req.name, template_folders) 121 | file_path = find_file(req.name, template_folders, ".jinja") 122 | if file_path == None: 123 | raise HTTPException(status_code=400, detail="template not found") 124 | 125 | with open(file_path, encoding="utf-8") as f: 126 | template_str = f.read() 127 | chatml_template = jinja2_env.from_string(template_str) 128 | 129 | 130 | @router.post("/v1/template/unload", description="This sets the template string to chatml") 131 | def unload_template(): 132 | global chatml_template 133 | chatml_template = jinja2_env.from_string(default_template_string) 134 | -------------------------------------------------------------------------------- /serve_completions.py: 
-------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | from fastapi.encoders import jsonable_encoder 4 | from sse_starlette import EventSourceResponse 5 | from api import * 6 | from sliced_llama import SlicedLLama 7 | from fastapi import APIRouter 8 | 9 | llm : SlicedLLama 10 | 11 | router = APIRouter() 12 | 13 | @router.post("/v1/completions", response_model=CompletionChoice) 14 | async def oai_completions(req: CompletionRequest) -> CompletionChoice | EventSourceResponse: 15 | print("request received: ", req) 16 | 17 | # update gen settings (TODO: do this properly?) 18 | llm.gen_settings.__dict__ = dict(llm.gen_settings.__dict__, **req.model_dump()) 19 | 20 | # handle parameters 21 | stop_str = req.stop 22 | if stop_str is str: stop_str = [stop_str] 23 | if stop_str is None: stop_str = [] 24 | 25 | 26 | if req.stream: 27 | async def event_generator(): 28 | ret_logprobs = CompletionLogProbs() if req.logprobs and req.logprobs > 0 else None 29 | text = '' 30 | finish_reason = None 31 | if not req.max_tokens: req.max_tokens = 99999999 32 | llm.start_stream(req.prompt, stop_strings=stop_str, logprobs=req.logprobs) 33 | try: 34 | for i in range(req.max_tokens): 35 | ret = llm.stream_next_token() 36 | text += ret.chunk 37 | print(ret.chunk, end='') 38 | if ret.eos: 39 | finish_reason = "stop" 40 | break 41 | 42 | # log probs 43 | if ret_logprobs: 44 | ret_logprobs = CompletionLogProbs( 45 | text_offset = [0], 46 | token_logprobs = [ret.logprob], 47 | tokens = [ret.chunk], 48 | top_logprobs = [ret.top_logprobs], 49 | ) 50 | 51 | stream_chunk_data = Completion( 52 | model = llm.model_name, 53 | choices = [CompletionChoice(finish_reason=finish_reason, text=ret.chunk, logprobs=ret_logprobs)] 54 | ) 55 | yield json.dumps(jsonable_encoder(stream_chunk_data)) # remove class names, fix braces and quotes 56 | await asyncio.sleep(0) 57 | except asyncio.CancelledError: 58 | print(" - STREAM CANCELLED") 59 | 60 | return EventSourceResponse(event_generator()) 61 | 62 | else: # not streaming 63 | text = '' 64 | finish_reason = "length" 65 | ret_logprobs = CompletionLogProbs() if req.logprobs and req.logprobs > 0 else None 66 | if not req.max_tokens: req.max_tokens = 99999999 67 | llm.start_stream(req.prompt) 68 | for _ in range(req.max_tokens): 69 | ret = llm.stream_next_token() 70 | 71 | # logprobs 72 | if ret_logprobs: 73 | ret_logprobs.text_offset += [len(text)], 74 | ret_logprobs.token_logprobs += [ret.logprob], 75 | ret_logprobs.tokens = [ret.chunk], 76 | ret_logprobs.top_logprobs = [ret.top_logprobs], 77 | 78 | text += ret.chunk 79 | if ret.eos: 80 | finish_reason = "stop" 81 | break 82 | 83 | # logprobs 84 | 85 | return Completion( 86 | model = llm.model_name, 87 | choices = [CompletionChoice(finish_reason=finish_reason, text=text, logprobs=ret_logprobs)] 88 | ) 89 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from subprocess import run 3 | import sys 4 | import os 5 | 6 | # create venv, try different python commands 7 | python_cmds = ["python", "python3", "py"] 8 | print("creating virtual environment...") 9 | for cmd in python_cmds: 10 | if os.path.isdir(".venv"): 11 | break 12 | try: 13 | run([cmd, "-m", "venv", ".venv"]) 14 | except: pass 15 | if not os.path.isdir(".venv"): 16 | print("failed to create virtual environment.") 17 | 18 | 19 | # determine platform 20 | exl2_version 
= "0.1.5" 21 | torch_str = "torch2.3.1" 22 | torch_str_fa_linux = "torch2.3" 23 | 24 | fa_version = "2.5.9.post1" 25 | fa_gpu_str = "" 26 | 27 | py_str = "cp" + ''.join(sys.version.split(".")[:2]) 28 | 29 | gpu_str = "" # CUDA or ROCm versions 30 | os_str = "" #"win_amd64" or "linux_x86_64" 31 | 32 | # check for CUDA or ROCm 33 | try: 34 | a = run("nvidia-smi", capture_output=True) 35 | if b'CUDA Version: 12' in a.stdout: 36 | gpu_str = "cu121" 37 | fa_gpu_str = "cu122" 38 | elif b'CUDA Version: 11' in a.stdout: 39 | gpu_str = "cu118" 40 | except: 41 | try: 42 | a = run("rocminfo", capture_output=True) 43 | major_version = int(a.stdout.decode('utf-8').split('Runtime Version')[1].strip().split('.')[0]) 44 | 45 | if major_version == 5: 46 | gpu_str = "rocm5.6" 47 | elif major_version >= 6: 48 | gpu_str = "rocm6.0" 49 | except: 50 | print("no CUDA or ROCm found") 51 | 52 | # determine OS 53 | if sys.platform == 'win32': 54 | os_str = "win_amd64" 55 | if sys.platform == 'linux': 56 | os_str = "linux_x86_64" 57 | 58 | 59 | ## specify requirenments 60 | requirements = [ 61 | "fastapi", 62 | "sse-starlette", 63 | "uvicorn", 64 | "tokenizers", 65 | "setuptools", 66 | ] 67 | 68 | # exllamav2 69 | if os_str and gpu_str: 70 | requirements.append(f"https://github.com/turboderp/exllamav2/releases/download/v{exl2_version}/exllamav2-{exl2_version}+{gpu_str}.{torch_str}-{py_str}-{py_str}-{os_str}.whl") 71 | else: 72 | requirements.append("exllamav2") 73 | 74 | # flash_attn 75 | if gpu_str.startswith("cu") : 76 | if os_str == "win_amd64": 77 | requirements.append(f"https://github.com/bdashore3/flash-attention/releases/download/v{fa_version}/flash_attn-{fa_version}+{fa_gpu_str}{torch_str}cxx11abiFALSE-{py_str}-{py_str}-{os_str}.whl") 78 | 79 | # linux 80 | if os_str == "linux_x86_64": 81 | requirements.append(f"https://github.com/bdashore3/flash-attention/releases/download/v{fa_version}/flash_attn-{fa_version}+{fa_gpu_str}{torch_str_fa_linux}cxx11abiFALSE-{py_str}-{py_str}-{os_str}.whl") 82 | # there is no fallback for flash attention, because it's optional 83 | 84 | 85 | # install requirements 86 | 87 | pip_cmd = "./.venv/Scripts/pip" if sys.platform == 'win32' else "./.venv/bin/pip" 88 | 89 | # pytorch 90 | if gpu_str.startswith("cu") or os_str == "linux_x86_64": 91 | run([pip_cmd, "install", "torch", "--index-url", f"https://download.pytorch.org/whl/{gpu_str}"]) 92 | 93 | for req in requirements: 94 | run([pip_cmd, "install", req]) 95 | -------------------------------------------------------------------------------- /sliced_llama.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | import os 3 | 4 | import api 5 | 6 | 7 | class StreamTokenReturn: 8 | chunk: str = "" 9 | eos: bool = True 10 | logprob: float = 0.0 11 | top_logprobs: dict[str, float] = {} 12 | 13 | from exllamav2.generator import ExLlamaV2Sampler 14 | 15 | GenSettings = ExLlamaV2Sampler.Settings 16 | 17 | class SlicedLLama: 18 | model_name: str = "" 19 | layers: int = 0 20 | is_streaming: bool 21 | gen_settings : GenSettings 22 | 23 | def __init__(self): 24 | raise NotImplementedError 25 | 26 | def load_model(path: str, cache_type: str = "FP16", max_seq_len : int|None = None): 27 | raise NotImplementedError 28 | 29 | def unload_model(): 30 | raise NotImplementedError 31 | 32 | def rearrange_layers(self, layer_list: list[int]): 33 | raise NotImplementedError 34 | 35 | def start_stream(self, prompt: str, stop_strings: list[str]=[], logprobs = 0, **kwargs): 36 | raise 
NotImplementedError 37 | 38 | def stream_next_token(self) -> StreamTokenReturn: 39 | raise NotImplementedError 40 | 41 | def stop_stream(self): 42 | raise NotImplementedError 43 | 44 | 45 | def generate_response(self, max_length: int = 9999999, stop_strings: list[str] = []) -> str: 46 | raise NotImplementedError 47 | 48 | def tokenize(self, text: str, add_bos_token: bool=True, 49 | encode_special_tokens: bool=True, decode_special_tokens: bool=True) -> list[int]: 50 | raise NotImplementedError 51 | 52 | def decode_tokens(self, tokens: list[int], add_bos_token: bool=True, 53 | encode_special_tokens: bool=True, decode_special_tokens: bool=True) -> str: 54 | raise NotImplementedError 55 | 56 | def create_embeddings(self, text) -> list[float]: 57 | raise NotImplementedError 58 | 59 | def count_tokens(self, text) -> int: 60 | return len(self.tokenize(text)) 61 | -------------------------------------------------------------------------------- /sliced_llama_exl2.py: -------------------------------------------------------------------------------- 1 | import gc 2 | from exllamav2 import * 3 | from exllamav2.generator import * 4 | from exllamav2.module import ExLlamaV2Module 5 | import sys, torch 6 | from sliced_llama import SlicedLLama, StreamTokenReturn 7 | from typing import Generator 8 | import os 9 | from copy import copy 10 | import math 11 | 12 | from urllib.parse import urlparse # to allow loading from path urls 13 | 14 | #from streaming2 import ExLlamaV2StreamingGenerator2 15 | 16 | 17 | class SlicedLLamaExl2(SlicedLLama): 18 | config: ExLlamaV2Config 19 | model: ExLlamaV2 20 | tokenizer: ExLlamaV2Tokenizer 21 | generator: ExLlamaV2StreamingGenerator 22 | gen_settings: ExLlamaV2Sampler.Settings 23 | is_streaming: bool = False 24 | original_modules: list[ExLlamaV2Module] # for repeated layer rearranging 25 | available_layers: int = 0 26 | cache : ExLlamaV2Cache | ExLlamaV2Cache_8bit 27 | 28 | # extra config 29 | n_logprobs: int = 0 30 | 31 | def __init__(self): 32 | pass 33 | 34 | def load_model(self, model_path: str, cache_type: str = "FP16", max_seq_len : int|None = None): 35 | if hasattr(self, 'model'): 36 | self.model.modules = self.original_modules 37 | self.model.unload() 38 | self.cache = None 39 | 40 | try: model_path = urlparse(model_path).path 41 | except: pass 42 | 43 | self.config = ExLlamaV2Config() 44 | self.config.model_dir = model_path 45 | self.config.prepare() 46 | if max_seq_len: 47 | self.config.max_seq_len = max_seq_len 48 | self.model = ExLlamaV2(self.config) 49 | self.original_modules = self.model.modules 50 | self.available_layers = self.config.num_hidden_layers 51 | 52 | if cache_type == "FP16": 53 | self.cache = ExLlamaV2Cache(self.model, lazy = True) 54 | else: 55 | self.cache = ExLlamaV2Cache_8bit(self.model, lazy = True) 56 | 57 | self.model.load_autosplit(self.cache) 58 | 59 | 60 | self.tokenizer = ExLlamaV2Tokenizer(self.config) 61 | self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer) 62 | self.gen_settings = ExLlamaV2Sampler.Settings() 63 | 64 | 65 | def rearrange_layers(self, layer_list: list[int]): 66 | if len(layer_list) < 1: raise ValueError 67 | if any([x < 0 or x >= self.available_layers for x in layer_list]): 68 | print("Layer index out of range!") 69 | raise ValueError 70 | # modules arangement: [embedding, [...layers], rms-norm, head] 71 | # where each layer is attention, mlp 72 | self.model.modules = self.original_modules[:1] 73 | for i, idx in enumerate(layer_list): 74 | self.model.modules += 
[copy(self.original_modules[idx*2 + 1])] 75 | self.model.modules[-1].layer_idx = i # use different cache for copied layer 76 | self.model.modules += [copy(self.original_modules[idx*2 + 2])] 77 | self.model.modules += self.original_modules[-2:] 78 | self.model.head_layer_idx = len(self.model.modules) -1 79 | self.model.config.num_hidden_layers = len(layer_list) 80 | self.model.last_kv_layer_idx = len(self.model.modules) -4 81 | 82 | # reload cache 83 | print("deleting old cache...") 84 | Cache = type(self.cache) 85 | del self.cache 86 | print("creating new cache...") 87 | self.model.cache_map = {} 88 | self.model.set_cache_map() 89 | self.cache = Cache(self.model) 90 | self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer) 91 | self.generator.set_stop_conditions([self.tokenizer.eos_token_id]) 92 | print("layers sucessfully rearranged!") 93 | ids = [id(x) for x in self.cache.key_states] 94 | 95 | 96 | def start_stream(self, prompt: str, stop_strings: list[str]=[], logprobs = 0, **kwargs): 97 | print(self.gen_settings.__dict__) 98 | text_ids = self.tokenizer.encode(prompt, add_bos = True) 99 | #max_length = min(max_length, self.model.config.max_seq_len - text_ids.size(0)) # TODO: rope scaling?? 100 | 101 | self.generator.set_stop_conditions(stop_strings) 102 | self.generator.begin_stream(text_ids, self.gen_settings) 103 | self.is_streaming = True 104 | self.n_logprobs = logprobs if logprobs else 0 105 | self.generator.return_logits = True 106 | 107 | def stream_next_token(self) -> StreamTokenReturn: 108 | ret = StreamTokenReturn() 109 | if not self.is_streaming: 110 | return ret 111 | chunk, eos, token, logits = self.generator.stream() 112 | ret.eos = eos 113 | ret.chunk = chunk 114 | 115 | if self.n_logprobs > 0: 116 | ret.top_logprobs = {} 117 | if token.numel() == 0: 118 | return ret 119 | 120 | token = token.flatten()[0] 121 | 122 | logprobs = logits[0].log_softmax(-1) 123 | ret.logprob = float(logprobs[token]) 124 | top_logprobs = logprobs.topk(self.n_logprobs) 125 | 126 | for (prob, tok) in zip(*top_logprobs): 127 | tok_str = self.tokenizer.decode(tok.reshape(1,1))[0] 128 | # sometimes, two tokens have the same string 129 | if tok_str in ret.top_logprobs: # adding logits together is complicated 130 | ret.top_logprobs[tok_str] = math.log(math.exp(float(prob)) + 131 | math.exp(ret.top_logprobs[tok_str])) 132 | else: 133 | ret.top_logprobs[tok_str] = float(prob) 134 | 135 | return ret 136 | 137 | def stop_stream(self): 138 | self.is_streaming = False 139 | 140 | def tokenize(self, text): 141 | return self.tokenizer.encode(text, add_bos = True).flatten().tolist() 142 | 143 | def generate_response(self, max_length: int = 9999999, stop_strings: list = []) -> str: 144 | text = "" 145 | self.start_stream() 146 | for i in range(max_length): 147 | chunk, eos, _ = self.generator.stream() 148 | text += chunk 149 | if eos: return text 150 | return text 151 | 152 | 153 | def create_embeddings(self, text) -> list[float]: 154 | ids = self.tokenize(text) 155 | return my_sliced_llama.model.forward(ids, return_last_state=True)[1].flatten().tolist() 156 | 157 | def decode_tokens(self, tokens: list[int], add_bos_token: bool=True, 158 | encode_special_tokens: bool=True, decode_special_tokens: bool=True) -> str: 159 | self.tokenizer.decode(torch.tensor(tokens)) 160 | 161 | # test code 162 | 163 | def create_layer_list(slices_str: str) -> list[int]: 164 | try: 165 | layers = [] 166 | for s in slices_str.split(','): 167 | a,b = s.split('-') 168 | layers += list(range(int(a), 
int(b))) 169 | return layers 170 | 171 | except ValueError: 172 | return [] 173 | 174 | if __name__ == "__main__": 175 | my_sliced_llama : SlicedLLama = SlicedLLamaExl2() 176 | 177 | my_sliced_llama.load_model('TinyLlama-1.1B-Chat-v1.0-3.0bpw-h6-exl2') 178 | print("load #1") 179 | import time 180 | time.sleep(3.0) 181 | my_sliced_llama.load_model('TinyLlama-1.1B-Chat-v1.0-3.0bpw-h6-exl2') 182 | print("load #2") 183 | 184 | layer_list = create_layer_list('0-14, 8-22') 185 | 186 | finish_reason = "length" 187 | text = "" 188 | max_tokens = 512 189 | my_sliced_llama.start_stream("Once upon a time") 190 | for i in range(max_tokens): 191 | chunk, eos, _ = my_sliced_llama.stream_next_token() 192 | text += chunk 193 | print(chunk, end='') 194 | if eos: 195 | finish_reason = "stop" 196 | break 197 | 198 | print("--------------------") 199 | 200 | layer_list = create_layer_list('0-22') 201 | 202 | finish_reason = "length" 203 | text = "" 204 | max_tokens = 512 205 | my_sliced_llama.start_stream("Once upon a time") 206 | for i in range(max_tokens): 207 | chunk, eos, _ = my_sliced_llama.stream_next_token() 208 | text += chunk 209 | print(chunk, end='') 210 | if eos: 211 | finish_reason = "stop" 212 | break 213 | 214 | print("--------------------") 215 | 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /sliced_llama_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S sh -c '"`dirname $0`/.venv/bin/python" "$0" "$@"' 2 | 3 | # run `uvicorn main:app --reload` 4 | import argparse 5 | import webbrowser 6 | import uvicorn 7 | from contextlib import asynccontextmanager 8 | 9 | from fastapi import FastAPI, HTTPException 10 | 11 | from fastapi.middleware.cors import CORSMiddleware 12 | 13 | 14 | from fastapi.staticfiles import StaticFiles 15 | 16 | from api import * 17 | import os 18 | 19 | from sliced_llama import SlicedLLama 20 | from sliced_llama_exl2 import SlicedLLamaExl2 21 | import serve_completions as compl 22 | import serve_chat_completions as chat_compl 23 | 24 | import sys 25 | basedir_path = os.path.abspath(os.path.dirname(sys.argv[0])) 26 | 27 | 28 | llm : SlicedLLama = SlicedLLamaExl2() 29 | server_host = '127.0.0.1' 30 | server_port = 57593 31 | 32 | compl.llm = llm 33 | chat_compl.llm = llm 34 | chat_compl.template_folders = [os.path.join(basedir_path, "chat_templates")] 35 | 36 | 37 | # server stuff 38 | app = FastAPI() 39 | app.add_middleware( 40 | CORSMiddleware, 41 | allow_origins=["*"], 42 | allow_credentials=True, 43 | allow_methods=["*"], 44 | allow_headers=["*"], 45 | ) 46 | app.include_router(compl.router) 47 | app.include_router(chat_compl.router) 48 | 49 | 50 | @app.get("/v1/models") 51 | def get_models(): 52 | return { 53 | "object": "list", 54 | "data": [ 55 | { 56 | "id": llm.model_name, 57 | "object": "model", 58 | "created": 0, # too lazy to look up a real number 59 | "owned_by": "dunno" 60 | }, 61 | ] 62 | } 63 | 64 | 65 | # tabbyAPI stuff 66 | @app.post('/v1/token/encode') 67 | def tabby_encode_token(req: TabbyTokenEncodeRequest) -> list[int]: 68 | return llm.tokenize(req.text) 69 | 70 | @app.post('/v1/token/decode') 71 | def tabby_decode_token(req: TabbyTokenDecodeRequest) -> str: 72 | return llm.decode_tokens(req.tokens) 73 | 74 | # TODO 75 | @app.get('/v1/internal/model/info') 76 | def get_model_info() -> ModelInfo: 77 | print(llm) 78 | model_card = ModelInfo() 79 | model_card.id = llm.model_name 80 | model_card.available_layers = llm.available_layers 81 | 
return model_card 82 | 83 | @app.post('/v1/model/load') 84 | def load_model(req: LoadModelRequest) -> ModelLoadResponse: 85 | if req.name: 86 | try: 87 | llm.load_model(req.name, req.cache_mode, max_seq_len=req.max_seq_len) 88 | 89 | # use folder name as model name 90 | if req.name[-1] in "/\\": 91 | req.name = req.name[:-1] 92 | llm.model_name = os.path.basename(req.name) 93 | return ModelLoadResponse(success=True) 94 | except Exception as e: 95 | raise HTTPException(status_code=400, detail=str(e)) 96 | if req.layer_list: 97 | try: 98 | llm.rearrange_layers(req.layer_list) 99 | except Exception as e: 100 | raise HTTPException(status_code=400, detail=str(e)) 101 | 102 | 103 | # layer slicing 104 | @app.post('/v1/rearrange_layers') 105 | def rearrange_layers(req: LayerRearrangeRequest): 106 | try: 107 | llm.rearrange_layers(req.layer_list) 108 | except Exception as e: 109 | raise HTTPException(status_code=400, detail=str(e)) 110 | 111 | 112 | # from koboldcpp 113 | @app.get('api/extra/tokencount') 114 | def kobold_cpp_tokencount(req): 115 | """for compatability reasons only. Please don't use this""" 116 | return { 'value': llm.tokenizer.count_tokens(req.prompt)} 117 | 118 | 119 | # Barebones WebUI 120 | app.mount("/", StaticFiles(directory= os.path.join(basedir_path, "webui"), html=True)) 121 | 122 | 123 | 124 | def create_layer_list(slices_str: str) -> list[int]: 125 | try: 126 | layers = [] 127 | for s in slices_str.split(','): 128 | a,b = s.split('-') 129 | layers += list(range(int(a), int(b))) 130 | return layers 131 | 132 | except ValueError: 133 | return [] 134 | 135 | if __name__ == "__main__": 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument("-m", "--model", help="path to exl2 folder") 138 | parser.add_argument("-c", "--cache-mode", help="FP8 or FP16", default="FP16") 139 | parser.add_argument("-p", "--port", help="A unique number below 65535", type=int, default=server_port) 140 | parser.add_argument("-s", "--slices", help="layers to use, e.g.: '0-24, 8-32'") 141 | parser.add_argument("-l", "--max-seq-len", "--context-size", help="context size, a lower number saves VRAM", type=int) 142 | args = parser.parse_args() 143 | 144 | if args.model != None: 145 | load_model(LoadModelRequest(name=args.model, cache_mode=args.cache_mode, max_seq_len=args.max_seq_len)) 146 | if args.slices != None: 147 | layer_list = create_layer_list(args.slices) 148 | llm.rearrange_layers(layer_list) 149 | 150 | 151 | server_port = args.port 152 | 153 | webbrowser.open(server_host + ":" + str(server_port)) 154 | 155 | uvicorn.run( 156 | app, 157 | host=server_host, 158 | port=server_port, 159 | log_level="info", 160 | ) 161 | -------------------------------------------------------------------------------- /start.bat: -------------------------------------------------------------------------------- 1 | .venv\Scripts\python sliced_llama_server.py -------------------------------------------------------------------------------- /webui/LICENSE-sse.js: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /webui/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/silphendio/sliced_llama/8fc71cd3493dd2eaf4965cc316a1f9550ce9e0c2/webui/favicon.ico -------------------------------------------------------------------------------- /webui/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-size: 1rem; 3 | display: flex; 4 | min-height: 100vh; 5 | flex-direction: column; 6 | flex: 1; 7 | margin: 0 auto; 8 | max-width: 60rem; 9 | padding: 0 0.5rem; 10 | } 11 | select, input, button { 12 | padding: 0.3rem; 13 | margin: 0.3rem; 14 | } 15 | 16 | details { 17 | border: 1px solid; 18 | border-radius: 5px; 19 | padding: 1rem; 20 | } 21 | summary { 22 | font-weight: bold; 23 | } 24 | 25 | 26 | 27 | #tooltip { 28 | position: fixed; 29 | display: inline-block; 30 | background-color: bisque; 31 | z-index: 1; 32 | padding: 5px; 33 | margin: 0 auto; 34 | } 35 | 36 | .token:hover { 37 | background-color:bisque; 38 | } 39 | .token_prob { 40 | background-color:white; 41 | text-align: center; 42 | margin: 5px; 43 | padding: 2px; 44 | border: thin solid; 45 | white-space: nowrap; 46 | float: left; 47 | } 48 | 49 | #completions_area { 50 | border: 1px solid; 51 | border-radius: 5px; 52 | margin-bottom: 1rem; 53 | padding: 1rem; 54 | resize: both; 55 | overflow: auto; 56 | border: thin solid; 57 | white-space: pre-wrap; 58 | 59 | } 60 | 61 | #model_info { 62 | border: thin solid; 63 | padding: 0.2rem; 64 | margin: 0.2rem; 65 | display: block; 66 | } 67 | 68 | @media (prefers-color-scheme: dark) { 69 | #tooltip { 70 | background-color: indigo; 71 | } 72 | .token:hover { 73 | background-color: indigo; 74 | } 75 | .token_prob { 76 | background-color: indigo; 77 | } 78 | } -------------------------------------------------------------------------------- /webui/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 |Or drag it here...
33 |